diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -129,10 +129,10 @@ "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; -def FeatureDoesNotSupportXNACK : SubtargetFeature<"no-xnack-support", - "DoesNotSupportXNACK", +def FeatureSupportsXNACK : SubtargetFeature<"xnack-support", + "SupportsXNACK", "true", - "Hardware does not support XNACK" + "Hardware supports XNACK" >; // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support @@ -491,16 +491,16 @@ [FeatureFlatGlobalInsts] >; -def FeatureDoesNotSupportSRAMECC : SubtargetFeature<"no-sram-ecc-support", - "DoesNotSupportSRAMECC", +def FeatureSupportsSRAMECC : SubtargetFeature<"sramecc-support", + "SupportsSRAMECC", "true", - "Hardware does not support SRAM ECC" + "Hardware supports SRAMECC" >; -def FeatureSRAMECC : SubtargetFeature<"sram-ecc", +def FeatureSRAMECC : SubtargetFeature<"sramecc", "EnableSRAMECC", "true", - "Enable SRAM ECC" + "Enable SRAMECC" >; def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx", @@ -675,8 +675,7 @@ [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, - FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC, - FeatureDoesNotSupportXNACK] + FeatureTrigReducedRange] >; def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", @@ -685,8 +684,7 @@ FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, - FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC, - FeatureUnalignedBufferAccess] + FeatureDsSrc2Insts, FeatureUnalignedBufferAccess] >; def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", @@ -699,8 +697,7 @@ FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, - FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC, FeatureFastDenormalF32, - FeatureUnalignedBufferAccess + FeatureDsSrc2Insts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess ] >; @@ -718,7 +715,7 @@ FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess, - FeatureUnalignedDSAccess + FeatureUnalignedDSAccess, FeatureSupportsXNACK ] >; @@ -735,7 +732,7 @@ FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, FeatureVOP3Literal, FeatureDPP8, - FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC, + FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess ] @@ -748,84 +745,72 @@ def FeatureISAVersion6_0_0 : FeatureSet<[FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion6_0_1 : FeatureSet< [FeatureSouthernIslands, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion6_0_2 : FeatureSet< [FeatureSouthernIslands, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion7_0_0 : FeatureSet< [FeatureSeaIslands, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion7_0_1 : FeatureSet< [FeatureSeaIslands, HalfRate64Ops, FeatureLDSBankCount32, - FeatureFastFMAF32, - FeatureDoesNotSupportXNACK]>; + FeatureFastFMAF32]>; def FeatureISAVersion7_0_2 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount16, - FeatureFastFMAF32, - FeatureDoesNotSupportXNACK]>; + FeatureFastFMAF32]>; def FeatureISAVersion7_0_3 : FeatureSet< [FeatureSeaIslands, - FeatureLDSBankCount16, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount16]>; def FeatureISAVersion7_0_4 : FeatureSet< [FeatureSeaIslands, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion7_0_5 : FeatureSet< [FeatureSeaIslands, - FeatureLDSBankCount16, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount16]>; def FeatureISAVersion8_0_1 : FeatureSet< [FeatureVolcanicIslands, FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, - FeatureXNACK, + FeatureSupportsXNACK, FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_2 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureSGPRInitBug, - FeatureUnpackedD16VMem, - FeatureDoesNotSupportXNACK]>; + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_3 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, - FeatureUnpackedD16VMem, - FeatureDoesNotSupportXNACK]>; + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_5 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureSGPRInitBug, - FeatureUnpackedD16VMem, - FeatureDoesNotSupportXNACK]>; + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_1_0 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount16, - FeatureXNACK, + FeatureSupportsXNACK, FeatureImageStoreD16Bug, FeatureImageGather4D16Bug]>; @@ -833,24 +818,18 @@ [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK, - FeatureDoesNotSupportSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_2 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureXNACK, - FeatureDoesNotSupportSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_4 : FeatureSet< [FeatureGFX9, FeatureLDSBankCount32, FeatureFmaMixInsts, - FeatureDoesNotSupportXNACK, - FeatureDoesNotSupportSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_6 : FeatureSet< @@ -861,7 +840,7 @@ FeatureDLInsts, FeatureDot1Insts, FeatureDot2Insts, - FeatureDoesNotSupportXNACK, + FeatureSupportsSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_8 : FeatureSet< @@ -879,15 +858,14 @@ FeatureMAIInsts, FeaturePkFmacF16Inst, FeatureAtomicFaddInsts, - FeatureSRAMECC, FeatureMFMAInlineLiteralBug, + FeatureSupportsSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_9 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureXNACK, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_C : FeatureSet< @@ -928,7 +906,7 @@ FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureDoesNotSupportXNACK])>; + FeatureSupportsXNACK])>; def FeatureISAVersion10_1_1 : FeatureSet< !listconcat(FeatureGroup.GFX10_1_Bugs, @@ -949,7 +927,7 @@ FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureDoesNotSupportXNACK])>; + FeatureSupportsXNACK])>; def FeatureISAVersion10_1_2 : FeatureSet< !listconcat(FeatureGroup.GFX10_1_Bugs, @@ -970,7 +948,7 @@ FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureDoesNotSupportXNACK])>; + FeatureSupportsXNACK])>; def FeatureISAVersion10_3_0 : FeatureSet< [FeatureGFX10, @@ -983,8 +961,7 @@ FeatureDot5Insts, FeatureDot6Insts, FeatureNSAEncoding, - FeatureWavefrontSize32, - FeatureDoesNotSupportXNACK]>; + FeatureWavefrontSize32]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -300,6 +300,8 @@ std::unique_ptr Legalizer; std::unique_ptr RegBankInfo; + Optional TargetID; + protected: // Basic subtarget description. Triple TargetTriple; @@ -319,8 +321,12 @@ bool UnalignedScratchAccess; bool UnalignedAccessMode; bool HasApertureRegs; + bool SupportsXNACK; + + // This should not be used directly. 'TargetID' tracks the dynamic settings + // for XNACK. bool EnableXNACK; - bool DoesNotSupportXNACK; + bool EnableCuMode; bool TrapHandler; @@ -374,8 +380,12 @@ bool HasMAIInsts; bool HasPkFmacF16Inst; bool HasAtomicFaddInsts; + bool SupportsSRAMECC; + + // This should not be used directly. 'TargetID' tracks the dynamic settings + // for SRAMECC. bool EnableSRAMECC; - bool DoesNotSupportSRAMECC; + bool HasNoSdstCMPX; bool HasVscnt; bool HasGetWaveIdInst; @@ -469,6 +479,11 @@ return RegBankInfo.get(); } + const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { + assert(TargetID.hasValue() && "TargetID has not be initialized"); + return *TargetID; + } + // Nothing implemented, just prevent crashes on use. const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { return &TSInfo; @@ -731,7 +746,7 @@ } bool isXNACKEnabled() const { - return EnableXNACK; + return getTargetID().isXnackOnOrAny(); } bool isCuModeEnabled() const { @@ -794,7 +809,7 @@ } bool d16PreservesUnusedBits() const { - return hasD16LoadStore() && !isSRAMECCEnabled(); + return hasD16LoadStore() && !getTargetID().isSramEccOnOrAny(); } bool hasD16Images() const { @@ -902,10 +917,6 @@ return HasAtomicFaddInsts; } - bool isSRAMECCEnabled() const { - return EnableSRAMECC; - } - bool hasNoSdstCMPX() const { return HasNoSdstCMPX; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -13,18 +13,19 @@ #include "AMDGPUSubtarget.h" #include "AMDGPU.h" -#include "AMDGPUTargetMachine.h" #include "AMDGPUCallLowering.h" #include "AMDGPUInstructionSelector.h" #include "AMDGPULegalizerInfo.h" #include "AMDGPURegisterBankInfo.h" -#include "SIMachineFunctionInfo.h" +#include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/IR/MDBuilder.h" #include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/MC/MCSubtargetInfo.h" #include using namespace llvm; @@ -85,9 +86,9 @@ // // Similarly we want enable-prt-strict-null to be on by default and not to // unset everything else if it is disabled + TargetID.emplace(*this); - // Assuming ECC is enabled is the conservative default. - SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,"); + SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,"; @@ -140,20 +141,12 @@ HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - // Disable XNACK on targets where it is not enabled by default unless it is - // explicitly requested. - if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) { - ToggleFeature(AMDGPU::FeatureXNACK); - EnableXNACK = false; - } + TargetID->setTargetIDFromFeaturesString(FS); - // ECC is on by default, but turn it off if the hardware doesn't support it - // anyway. This matters for the gfx9 targets with d16 loads, but don't support - // ECC. - if (DoesNotSupportSRAMECC && EnableSRAMECC) { - ToggleFeature(AMDGPU::FeatureSRAMECC); - EnableSRAMECC = false; - } + LLVM_DEBUG(dbgs() << "xnack setting for subtarget: " + << TargetID->getXnackSetting() << '\n'); + LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: " + << TargetID->getSramEccSetting() << '\n'); return *this; } @@ -197,8 +190,8 @@ UnalignedAccessMode(false), HasApertureRegs(false), + SupportsXNACK(false), EnableXNACK(false), - DoesNotSupportXNACK(false), EnableCuMode(false), TrapHandler(false), @@ -247,8 +240,8 @@ HasMAIInsts(false), HasPkFmacF16Inst(false), HasAtomicFaddInsts(false), + SupportsSRAMECC(false), EnableSRAMECC(false), - DoesNotSupportSRAMECC(false), HasNoSdstCMPX(false), HasVscnt(false), HasGetWaveIdInst(false), diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -19,6 +19,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetParser.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -70,6 +71,84 @@ TRAP_NUM_SGPRS = 16 }; +enum class TargetIDSetting { + Unsupported, + Any, + Off, + On +}; + +class AMDGPUTargetID { +private: + TargetIDSetting XnackSetting; + TargetIDSetting SramEccSetting; + +public: + explicit AMDGPUTargetID(const MCSubtargetInfo &STI); + ~AMDGPUTargetID() = default; + + /// \return True if the current xnack setting is not "Unsupported". + bool isXnackSupported() const { + return XnackSetting != TargetIDSetting::Unsupported; + } + + /// \returns True if the current xnack setting is "On" or "Any". + bool isXnackOnOrAny() const { + return XnackSetting == TargetIDSetting::On || + XnackSetting == TargetIDSetting::Any; + } + + /// \returns True if current xnack setting is "On" or "Off", + /// false otherwise. + bool isXnackOnOrOff() const { + return getXnackSetting() == TargetIDSetting::On || + getXnackSetting() == TargetIDSetting::Off; + } + + /// \returns The current xnack TargetIDSetting, possible options are + /// "Unsupported", "Any", "Off", and "On". + TargetIDSetting getXnackSetting() const { + return XnackSetting; + } + + /// Sets xnack setting to \p NewXnackSetting. + void setXnackSetting(TargetIDSetting NewXnackSetting) { + XnackSetting = NewXnackSetting; + } + + /// \return True if the current sramecc setting is not "Unsupported". + bool isSramEccSupported() const { + return SramEccSetting != TargetIDSetting::Unsupported; + } + + /// \returns True if the current sramecc setting is "On" or "Any". + bool isSramEccOnOrAny() const { + return SramEccSetting == TargetIDSetting::On || + SramEccSetting == TargetIDSetting::Any; + } + + /// \returns True if current sramecc setting is "On" or "Off", + /// false otherwise. + bool isSramEccOnOrOff() const { + return getSramEccSetting() == TargetIDSetting::On || + getSramEccSetting() == TargetIDSetting::Off; + } + + /// \returns The current sramecc TargetIDSetting, possible options are + /// "Unsupported", "Any", "Off", and "On". + TargetIDSetting getSramEccSetting() const { + return SramEccSetting; + } + + /// Sets sramecc setting to \p NewSramEccSetting. + void setSramEccSetting(TargetIDSetting NewSramEccSetting) { + SramEccSetting = NewSramEccSetting; + } + + void setTargetIDFromFeaturesString(StringRef FS); + void setTargetIDFromTargetIDStream(StringRef TargetID); +}; + /// Streams isa version string for given subtarget \p STI into \p Stream. void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream); @@ -846,6 +925,10 @@ }; } // end namespace AMDGPU + +raw_ostream &operator<<(raw_ostream &OS, + const AMDGPU::IsaInfo::TargetIDSetting S); + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -267,6 +267,94 @@ namespace IsaInfo { +AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI) + : XnackSetting(TargetIDSetting::Any), SramEccSetting(TargetIDSetting::Any) { + if (!STI.getFeatureBits().test(FeatureSupportsXNACK)) + XnackSetting = TargetIDSetting::Unsupported; + if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC)) + SramEccSetting = TargetIDSetting::Unsupported; +} + +void AMDGPUTargetID::setTargetIDFromFeaturesString(StringRef FS) { + // Check if xnack or sramecc is explicitly enabled or disabled. In the + // absence of the target features we assume we must generate code that can run + // in any environment. + SubtargetFeatures Features(FS); + Optional XnackRequested; + Optional SramEccRequested; + + for (const std::string &Feature : Features.getFeatures()) { + if (Feature == "+xnack") + XnackRequested = true; + else if (Feature == "-xnack") + XnackRequested = false; + else if (Feature == "+sramecc") + SramEccRequested = true; + else if (Feature == "-sramecc") + SramEccRequested = false; + } + + bool XnackSupported = isXnackSupported(); + bool SramEccSupported = isSramEccSupported(); + + if (XnackRequested.hasValue()) { + if (XnackSupported) { + XnackSetting = + *XnackRequested ? TargetIDSetting::On : TargetIDSetting::Off; + } else { + // If a specific xnack setting was requested and this GPU does not support + // xnack emit a warning. Setting will remain set to "Unsupported". + if (*XnackRequested) { + errs() << "warning: xnack 'On' was requested for a processor that does " + "not support it!\n"; + } else { + errs() << "warning: xnack 'Off' was requested for a processor that " + "does not support it!\n"; + } + } + } + + if (SramEccRequested.hasValue()) { + if (SramEccSupported) { + SramEccSetting = + *SramEccRequested ? TargetIDSetting::On : TargetIDSetting::Off; + } else { + // If a specific sramecc setting was requested and this GPU does not + // support sramecc emit a warning. Setting will remain set to + // "Unsupported". + if (*SramEccRequested) { + errs() << "warning: sramecc 'On' was requested for a processor that " + "does not support it!\n"; + } else { + errs() << "warning: sramecc 'Off' was requested for a processor that " + "does not support it!\n"; + } + } + } +} + +static TargetIDSetting +getTargetIDSettingFromFeatureString(StringRef FeatureString) { + if (FeatureString.endswith("-")) + return TargetIDSetting::Off; + if (FeatureString.endswith("+")) + return TargetIDSetting::On; + + llvm_unreachable("Malformed feature string"); +} + +void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) { + SmallVector TargetIDSplit; + TargetID.split(TargetIDSplit, ':'); + + for (const auto &FeatureString : TargetIDSplit) { + if (FeatureString.startswith("xnack")) + XnackSetting = getTargetIDSettingFromFeatureString(FeatureString); + if (FeatureString.startswith("sramecc")) + SramEccSetting = getTargetIDSettingFromFeatureString(FeatureString); + } +} + void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) { auto TargetTriple = STI->getTargetTriple(); auto Version = getIsaVersion(STI->getCPU()); @@ -283,7 +371,7 @@ if (hasXNACK(*STI)) Stream << "+xnack"; if (hasSRAMECC(*STI)) - Stream << "+sram-ecc"; + Stream << "+sramecc"; Stream.flush(); } @@ -1631,4 +1719,24 @@ } } // namespace AMDGPU + +raw_ostream &operator<<(raw_ostream &OS, + const AMDGPU::IsaInfo::TargetIDSetting S) { + switch (S) { + case (AMDGPU::IsaInfo::TargetIDSetting::Unsupported): + OS << "Unsupported"; + break; + case (AMDGPU::IsaInfo::TargetIDSetting::Any): + OS << "Any"; + break; + case (AMDGPU::IsaInfo::TargetIDSetting::Off): + OS << "Off"; + break; + case (AMDGPU::IsaInfo::TargetIDSetting::On): + OS << "On"; + break; + } + return OS; +} + } // namespace llvm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s ; Check lowering of some large extractelement that use the stack ; instead of register indexing. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -4,32 +4,14 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(<4 x i128> addrspace(4)* inreg %ptr, i32 inreg %idx) { -; GFX9-LABEL: extractelement_sgpr_v4i128_sgpr_idx: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 -; GFX9-NEXT: s_lshl_b32 m0, s4, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GFX9-NEXT: s_movrels_b64 s[2:3], s[10:11] -; GFX9-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: extractelement_sgpr_v4i128_sgpr_idx: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 -; GFX8-NEXT: s_lshl_b32 m0, s4, 1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GFX8-NEXT: s_movrels_b64 s[2:3], s[10:11] -; GFX8-NEXT: ; return to shader part epilog -; -; GFX7-LABEL: extractelement_sgpr_v4i128_sgpr_idx: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 -; GFX7-NEXT: s_lshl_b32 m0, s4, 1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GFX7-NEXT: s_movrels_b64 s[2:3], s[10:11] -; GFX7-NEXT: ; return to shader part epilog +; GCN-LABEL: extractelement_sgpr_v4i128_sgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 +; GCN-NEXT: s_lshl_b32 m0, s4, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_movrels_b64 s[0:1], s[8:9] +; GCN-NEXT: s_movrels_b64 s[2:3], s[10:11] +; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx ret i128 %element @@ -112,57 +94,57 @@ ; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 1, v2 -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: v_add_u32_e32 v17, 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v8, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v9, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, v13, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v10, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, v11, v7, vcc ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v14, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -1672,7 +1672,7 @@ ; GPRIDX-NEXT: is_ptr64 = 1 ; GPRIDX-NEXT: is_dynamic_callstack = 0 ; GPRIDX-NEXT: is_debug_enabled = 0 -; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 1 ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 @@ -2186,7 +2186,7 @@ ; GPRIDX-NEXT: is_ptr64 = 1 ; GPRIDX-NEXT: is_dynamic_callstack = 0 ; GPRIDX-NEXT: is_debug_enabled = 0 -; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 1 ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 @@ -2361,7 +2361,7 @@ ; GPRIDX-NEXT: is_ptr64 = 1 ; GPRIDX-NEXT: is_dynamic_callstack = 0 ; GPRIDX-NEXT: is_debug_enabled = 0 -; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 1 ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -7,17 +7,18 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.ptr, <64 x i32> addrspace(1)* %ptr, i32 %val, i32 %idx) #0 { ; GCN-LABEL: v_insert_v64i32_varidx: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GCN-NEXT: v_mov_b32_e32 v16, 0x100 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: v_add_u32_e32 v31, 64, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x0 ; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x40 ; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x80 -; GCN-NEXT: v_add_u32_e32 v31, 64, v16 ; GCN-NEXT: v_add_u32_e32 v32, 0x44, v16 +; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 @@ -43,7 +44,6 @@ ; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v1, s53 ; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s54 ; GCN-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v34, 0x4c, v16 @@ -175,7 +175,7 @@ ; GCN-NEXT: buffer_store_dword v1, v75, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v76, 0xf4, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NEXT: s_and_b32 s5, s5, 63 +; GCN-NEXT: s_and_b32 s4, s7, 63 ; GCN-NEXT: buffer_store_dword v1, v76, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v77, 0xf8, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s50 @@ -183,7 +183,7 @@ ; GCN-NEXT: buffer_store_dword v1, v77, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v78, 0xfc, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s51 -; GCN-NEXT: s_lshl_b32 s5, s5, 2 +; GCN-NEXT: s_lshl_b32 s4, s4, 2 ; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v18, 12, v16 ; GCN-NEXT: v_add_u32_e32 v19, 16, v16 @@ -199,8 +199,8 @@ ; GCN-NEXT: v_add_u32_e32 v29, 56, v16 ; GCN-NEXT: v_add_u32_e32 v30, 60, v16 ; GCN-NEXT: buffer_store_dword v1, v78, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, s5, v16 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_add_u32_e32 v1, s4, v16 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen @@ -216,6 +216,7 @@ ; GCN-NEXT: buffer_store_dword v15, v30, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v2, v17, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v3, v18, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v4, v19, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -2133,8 +2133,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v16i16_s_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: s_and_b32 s1, s3, 1 ; GFX9-NEXT: s_lshr_b32 s12, s3, 1 ; GFX9-NEXT: s_mov_b32 s0, 0xffff @@ -2152,26 +2152,26 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off @@ -2843,8 +2843,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v16i16_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX9-NEXT: s_mov_b32 s0, 0xffff @@ -2862,25 +2862,25 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] -; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off @@ -2992,8 +2992,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v16i16_v_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: s_and_b32 s1, s2, 1 ; GFX9-NEXT: s_lshr_b32 s12, s2, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 @@ -3009,26 +3009,26 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] -; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off @@ -3140,8 +3140,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v16i16_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 @@ -3158,25 +3158,25 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] ; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -38,6 +38,7 @@ ; GCN-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:48 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 ; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 ; GCN-NEXT: s_waitcnt vmcnt(7) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -1486,10 +1486,10 @@ ; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -256,38 +256,38 @@ ; GFX10_W32-LABEL: test_div_fmas_f32: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x4 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dword s7, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s4 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10_W32-NEXT: v_div_fmas_f32 v0, s5, v0, v1 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s7 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x4 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dword s7, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s4 -; GFX10_W64-NEXT: v_div_fmas_f32 v0, s5, v0, v1 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s7 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) store float %result, float addrspace(1)* %out, align 4 @@ -334,34 +334,34 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) store float %result, float addrspace(1)* %out, align 4 @@ -408,34 +408,34 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x58 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x58 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x34 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x58 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x58 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x34 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) store float %result, float addrspace(1)* %out, align 4 @@ -482,34 +482,34 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) store float %result, float addrspace(1)* %out, align 4 @@ -560,36 +560,36 @@ ; GFX10_W32-LABEL: test_div_fmas_f64: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x1 -; GFX10_W32-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX10_W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10_W32-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s8, 1, s8 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s6 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s5 -; GFX10_W32-NEXT: v_mov_b32_e32 v3, s7 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 -; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] +; GFX10_W32-NEXT: s_and_b32 s0, 1, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s8 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s10 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s9 +; GFX10_W32-NEXT: v_mov_b32_e32 v3, s11 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX10_W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10_W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10_W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f64: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x1 -; GFX10_W64-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX10_W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10_W64-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s8, 1, s8 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s6 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5 -; GFX10_W64-NEXT: v_mov_b32_e32 v3, s7 -; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] +; GFX10_W64-NEXT: s_and_b32 s0, 1, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s8 +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s10 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s9 +; GFX10_W64-NEXT: v_mov_b32_e32 v3, s11 +; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX10_W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX10_W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10_W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10_W64-NEXT: s_endpgm %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) store double %result, double addrspace(1)* %out, align 8 @@ -639,35 +639,35 @@ ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x1 ; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x1 ; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %cmp = icmp eq i32 %i, 0 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) @@ -713,34 +713,34 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, 0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) store float %result, float addrspace(1)* %out, align 4 @@ -785,34 +785,34 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, 1 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, 1 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) store float %result, float addrspace(1)* %out, align 4 @@ -890,15 +890,15 @@ ; GFX10_W32-NEXT: s_clause 0x2 ; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7] ; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4 -; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8 +; GFX10_W32-NEXT: global_load_dword v4, v1, s[6:7] offset:8 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) -; GFX10_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v4 ; GFX10_W32-NEXT: global_store_dword v1, v0, s[4:5] offset:8 ; GFX10_W32-NEXT: s_endpgm ; @@ -912,15 +912,15 @@ ; GFX10_W64-NEXT: s_clause 0x2 ; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7] ; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4 -; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8 +; GFX10_W64-NEXT: global_load_dword v4, v1, s[6:7] offset:8 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 ; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[0:1] ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v4 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[4:5] offset:8 ; GFX10_W64-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -51,9 +51,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 +; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -117,9 +117,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v0, v1 +; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -182,13 +182,13 @@ ; GFX10-LABEL: test_div_scale_f64_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -253,13 +253,13 @@ ; GFX10-LABEL: test_div_scale_f64_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -796,14 +796,14 @@ ; GFX10-LABEL: test_div_scale_f32_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s3, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_scale_f32 v0, s0, s5, s5, s4 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -841,14 +841,14 @@ ; GFX10-LABEL: test_div_scale_f32_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s2, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s5, s4 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) %result0 = extractvalue { float, i1 } %result, 0 @@ -887,11 +887,11 @@ ; ; GFX10-LABEL: test_div_scale_f64_all_scalar_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3] @@ -934,11 +934,11 @@ ; ; GFX10-LABEL: test_div_scale_f64_all_scalar_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3] @@ -1115,12 +1115,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1187,9 +1187,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2 ; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll @@ -30,6 +30,18 @@ ; GFX81-NEXT: s_mov_b32 s7, s9 ; GFX81-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 ; GFX81-NEXT: s_endpgm +; PACKED-LABEL: image_store_f16: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_mov_b32 s0, s2 +; PACKED-NEXT: s_mov_b32 s1, s3 +; PACKED-NEXT: s_mov_b32 s2, s4 +; PACKED-NEXT: s_mov_b32 s3, s5 +; PACKED-NEXT: s_mov_b32 s4, s6 +; PACKED-NEXT: s_mov_b32 s5, s7 +; PACKED-NEXT: s_mov_b32 s6, s8 +; PACKED-NEXT: s_mov_b32 s7, s9 +; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 +; PACKED-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } @@ -61,6 +73,18 @@ ; GFX81-NEXT: s_mov_b32 s7, s9 ; GFX81-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16 ; GFX81-NEXT: s_endpgm +; PACKED-LABEL: image_store_v2f16: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_mov_b32 s0, s2 +; PACKED-NEXT: s_mov_b32 s1, s3 +; PACKED-NEXT: s_mov_b32 s2, s4 +; PACKED-NEXT: s_mov_b32 s3, s5 +; PACKED-NEXT: s_mov_b32 s4, s6 +; PACKED-NEXT: s_mov_b32 s5, s7 +; PACKED-NEXT: s_mov_b32 s6, s8 +; PACKED-NEXT: s_mov_b32 s7, s9 +; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16 +; PACKED-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } @@ -135,6 +159,18 @@ ; GFX81-NEXT: s_mov_b32 s7, s9 ; GFX81-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16 ; GFX81-NEXT: s_endpgm +; PACKED-LABEL: image_store_v4f16: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_mov_b32 s0, s2 +; PACKED-NEXT: s_mov_b32 s1, s3 +; PACKED-NEXT: s_mov_b32 s2, s4 +; PACKED-NEXT: s_mov_b32 s3, s5 +; PACKED-NEXT: s_mov_b32 s4, s6 +; PACKED-NEXT: s_mov_b32 s5, s7 +; PACKED-NEXT: s_mov_b32 s6, s8 +; PACKED-NEXT: s_mov_b32 s7, s9 +; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16 +; PACKED-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -21,14 +21,14 @@ ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; encoding: [0x00,0x01,0x00,0xf4,0x2c,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; encoding: [0x04,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x00,0x00] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x02,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0 store i32 %tmp0, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll @@ -4,6 +4,21 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -19,7 +34,12 @@ ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret half %val +} + +define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 @@ -31,14 +51,9 @@ ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) - ret half %val -} - -define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -65,21 +80,6 @@ ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; PACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <2 x half> %val } @@ -91,6 +91,24 @@ ; } define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 + ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 + ; PACKED: $vgpr0 = COPY [[COPY7]] + ; PACKED: $vgpr1 = COPY [[COPY8]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; UNPACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -127,29 +145,27 @@ ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret <4 x half> %val +} + +define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 - ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 - ; PACKED: $vgpr0 = COPY [[COPY7]] - ; PACKED: $vgpr1 = COPY [[COPY8]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) - ret <4 x half> %val -} - -define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -166,27 +182,57 @@ ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 - ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val } define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: successors: %bb.2(0x80000000) + ; PACKED: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; PACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; PACKED: bb.2: + ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; PACKED: bb.3: + ; PACKED: successors: %bb.4(0x80000000) + ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; PACKED: bb.4: + ; PACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 + ; PACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 + ; PACKED: $vgpr0 = COPY [[COPY11]] + ; PACKED: $vgpr1 = COPY [[COPY12]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; UNPACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: successors: %bb.2(0x80000000) @@ -251,72 +297,11 @@ ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: successors: %bb.2(0x80000000) - ; PACKED: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; PACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; PACKED: bb.2: - ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec - ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec - ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; PACKED: bb.3: - ; PACKED: successors: %bb.4(0x80000000) - ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; PACKED: bb.4: - ; PACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 - ; PACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 - ; PACKED: $vgpr0 = COPY [[COPY11]] - ; PACKED: $vgpr1 = COPY [[COPY12]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <4 x half> %val } define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { - ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 - ; UNPACKED: bb.1 (%ir-block.0): - ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; UNPACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7" + 4095, align 1, addrspace 4) - ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] - ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 ; PACKED: bb.1 (%ir-block.0): ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -332,6 +317,21 @@ ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7" + 4095, align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -20,15 +20,15 @@ ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) store i32 %tmp0, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -20,52 +20,52 @@ ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 11, v0 ; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off offset:-10 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off offset:-9 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-8 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-7 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-6 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-5 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-4 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off offset:-3 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-2 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xff +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v15, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[2:3], off offset:-10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[2:3], off offset:-9 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-8 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-7 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off offset:-6 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-5 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-4 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v2, s5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v13 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s4, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v15, s4, v2 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v11, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v5, v6, v3, v7 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v9 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v10, v3, v11 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v4 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v7 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v8, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v10, v11, v0, v10 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v7, v7, v0, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v6 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v10, v9, v8 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v5, v4 ; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1: @@ -158,26 +158,26 @@ ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 10, v0 ; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off offset:-8 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off offset:-6 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-4 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-2 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v9, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v8, v[2:3], off offset:-8 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v7, v[2:3], off offset:-6 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-4 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v8 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v4, v3, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v6, v3, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v7, v2, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v9, s4, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v5, v2, v3 ; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2: @@ -398,52 +398,52 @@ ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v0, s[0:1] offset:1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v0, s[0:1] offset:2 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v0, s[0:1] offset:3 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v0, s[0:1] offset:4 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v0, s[0:1] offset:5 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v0, s[0:1] offset:6 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v0, s[0:1] offset:7 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:9 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:10 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v0, s[0:1] offset:11 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xff +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v0, s[0:1] +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v0, s[0:1] offset:3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v0, s[0:1] offset:4 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v0, s[0:1] offset:6 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v0, s[0:1] offset:7 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v0, s[0:1] offset:8 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v0, s[0:1] offset:9 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v0, s[0:1] offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v0, s[0:1] offset:11 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v11, s1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, s0, v10 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v1, s0, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v11, v12, s0, v11 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v6, s1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v7, s1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v12 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v0, v12 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v4, v5, s0, v6 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v8 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v7, v9, v12, v10 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 24, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v3 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v4, v5, v6 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v8, v9 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v1, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v4, v0, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v7, v8, s0, v7 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v11, v10, v9 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v7, v6, v5 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v2, v4 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 @@ -543,25 +543,25 @@ ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v0, s[0:1] -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v0, s[0:1] offset:4 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v0, s[0:1] offset:6 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v0, s[0:1] offset:8 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v0, s[0:1] offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v0, s[0:1] +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v0, s[0:1] offset:2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v0, s[0:1] offset:4 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v0, s[0:1] offset:6 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v0, s[0:1] offset:10 ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v5 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v1, s0, v2 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v3, s0, v4 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v5, s0, v6 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v4, s0, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, s0, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v6, s0, v0 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 @@ -619,10 +619,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(<3 x i32> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -641,10 +641,10 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(i96 addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_i96_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -663,10 +663,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(<3 x i32> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -685,10 +685,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(<6 x i16> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v6i16_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -97,14 +97,14 @@ ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, gv3@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, gv3@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v0, s[4:5] -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v0, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: BB1_2: ; %Flow ; GFX9-NEXT: s_xor_b32 s0, s0, -1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 @@ -118,8 +118,8 @@ ; GFX9-NEXT: s_add_u32 s2, s2, gv1@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, gv1@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -36,17 +36,17 @@ ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x10 -; GCN-NEXT: s_add_u32 s5, s32, 0x1000 -; GCN-NEXT: s_add_u32 s8, s5, 4 +; GCN-NEXT: s_load_dword s8, s[4:5], 0x10 +; GCN-NEXT: s_add_u32 s4, s32, 0x1000 +; GCN-NEXT: s_add_u32 s5, s4, 4 +; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: s_lshl_b32 s5, s8, 2 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s8 -; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_add_u32 s4, s4, s5 ; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen @@ -108,18 +108,18 @@ ; GCN-NEXT: s_cbranch_scc1 BB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_load_dword s4, s[4:5], 0xc -; GCN-NEXT: s_add_u32 s5, s32, 0x1000 -; GCN-NEXT: s_and_b32 s5, s5, 0xfffff000 -; GCN-NEXT: s_add_u32 s8, s5, 4 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xc +; GCN-NEXT: s_add_u32 s4, s32, 0x1000 +; GCN-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GCN-NEXT: s_add_u32 s5, s4, 4 +; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 2 -; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: s_lshl_b32 s5, s8, 2 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s8 -; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_add_u32 s4, s4, s5 ; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -8,14 +8,14 @@ define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; @@ -39,51 +39,51 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s5, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 -; GFX9-NEXT: s_lshr_b32 s7, s0, 24 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s3, s4, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NEXT: s_lshr_b32 s1, s5, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s5, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_lshr_b32 s0, s2, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s6, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s0, s3, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_lshr_b32 s0, s7, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s3, 16 +; GFX9-NEXT: s_lshr_b32 s1, s7, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:13 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s2, s3, 24 +; GFX9-NEXT: s_lshr_b32 s2, s7, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:14 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:15 @@ -148,27 +148,27 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s0, s5, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, s6, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s0, s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_lshr_b32 s0, s7, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:14 @@ -209,15 +209,15 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: ds_write2_b32 v1, v3, v0 offset0:2 offset1:3 ; GFX9-NEXT: s_endpgm ; @@ -242,14 +242,14 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; @@ -273,14 +273,14 @@ define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -8,13 +8,13 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; @@ -37,42 +37,42 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_lshr_b32 s0, s12, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s1, s12, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s3, s12, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshr_b32 s0, s13, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshr_b32 s4, s1, 24 +; GFX9-NEXT: s_lshr_b32 s1, s13, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s13, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_lshr_b32 s0, s2, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: s_lshr_b32 s0, s14, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 +; GFX9-NEXT: s_lshr_b32 s1, s14, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s3, s2, 24 +; GFX9-NEXT: s_lshr_b32 s2, s14, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX9-NEXT: s_endpgm ; @@ -124,22 +124,22 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_lshr_b32 s0, s12, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshr_b32 s0, s13, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: s_lshr_b32 s0, s14, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 @@ -175,13 +175,13 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX9-NEXT: s_endpgm @@ -206,13 +206,13 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX9-NEXT: s_endpgm @@ -237,13 +237,13 @@ define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -360,7 +360,6 @@ ; IR-LABEL: @select_mul_lhs_const_i32( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000 ; IR-NEXT: ret i32 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %op = mul i32 1000, %select ret i32 %op @@ -380,7 +379,6 @@ ; IR-LABEL: @select_mul_rhs_const_i32( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000 ; IR-NEXT: ret i32 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %op = mul i32 %select, 1000 ret i32 %op @@ -420,7 +418,6 @@ ; IR-LABEL: @select_add_trunc_select( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50 ; IR-NEXT: ret i16 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %trunc = trunc i32 %select to i16 %op = add i16 %trunc, 42 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -177,33 +177,33 @@ ; ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 +; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB1_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm @@ -244,28 +244,28 @@ ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -277,20 +277,20 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s1, s0, s1 -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: s_mul_i32 s3, s2, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -299,9 +299,9 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -313,20 +313,20 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -335,9 +335,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 @@ -1729,33 +1729,33 @@ ; ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB9_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 +; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB9_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm @@ -1796,28 +1796,28 @@ ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB9_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -1829,20 +1829,20 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s1, s0, s1 -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: s_mul_i32 s3, s2, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -1851,9 +1851,9 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -1865,20 +1865,20 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -1887,9 +1887,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=-xnack -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s ; For gfx1010, overestimate the branch size in case we need to insert ; a nop for the buggy offset. diff --git a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir --- a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir +++ b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -1,7 +1,9 @@ # RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s -# Make sure the default assumption is xnack enabled with no cpu -# RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+volcanic-islands -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s +# Make sure the default assumption is xnack not supported when no cpu is specified. +# RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+volcanic-islands -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s + +# Make sure the assumption is xnack not supported when fiji cpu is specified. # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s --- # Trivial clause at beginning of program diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -240,6 +240,7 @@ ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 ; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -395,6 +396,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -417,6 +419,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -1,7 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefixes=CHECK,GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefixes=CHECK,GCN %s ; RUN: FileCheck --enable-var-scope --check-prefixes=CHECK,DBG %s < %t ; REQUIRES: asserts +; FIXME: Verifier error with xnack enabled. + ; CHECK-LABEL: {{^}}cluster_load_cluster_store: define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICI,SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICIVI,SICI,SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICIVI,SICI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,SICIVI,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}load_i32: ; GCN-DAG: s_mov_b32 s3, 0 @@ -9,8 +9,8 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 +; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 +; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 define amdgpu_vs float @load_i32(i32 addrspace(6)* inreg %p0, i32 addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds i32, i32 addrspace(6)* %p1, i32 2 %r0 = load i32, i32 addrspace(6)* %p0 @@ -21,13 +21,18 @@ } ; GCN-LABEL: {{^}}load_v2i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 +; SICIVI-DAG: s_mov_b32 s3, 0 +; SICIVI-DAG: s_mov_b32 s2, s1 +; SICIVI-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; GFX9-DAG: s_mov_b32 s6, s1 +; GFX9-DAG: s_mov_b32 s7, 0 +; GFX9-DAG: s_mov_b32 s1, s7 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[6:7], 0x10 define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(6)* %p1, i32 2 %r0 = load <2 x i32>, <2 x i32> addrspace(6)* %p0 @@ -43,8 +48,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 +; VI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 +; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 define amdgpu_vs <4 x float> @load_v4i32(<4 x i32> addrspace(6)* inreg %p0, <4 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(6)* %p1, i32 2 %r0 = load <4 x i32>, <4 x i32> addrspace(6)* %p0 @@ -60,8 +67,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +; VI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 define amdgpu_vs <8 x float> @load_v8i32(<8 x i32> addrspace(6)* inreg %p0, <8 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(6)* %p1, i32 2 %r0 = load <8 x i32>, <8 x i32> addrspace(6)* %p0 @@ -77,8 +86,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +; VI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 define amdgpu_vs <16 x float> @load_v16i32(<16 x i32> addrspace(6)* inreg %p0, <16 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <16 x i32>, <16 x i32> addrspace(6)* %p1, i32 2 %r0 = load <16 x i32>, <16 x i32> addrspace(6)* %p0 @@ -94,8 +105,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 +; VI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 +; VI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 +; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 +; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 define amdgpu_vs float @load_float(float addrspace(6)* inreg %p0, float addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds float, float addrspace(6)* %p1, i32 2 %r0 = load float, float addrspace(6)* %p0 @@ -105,13 +118,18 @@ } ; GCN-LABEL: {{^}}load_v2float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 +; SICIVI-DAG: s_mov_b32 s3, 0 +; SICIVI-DAG: s_mov_b32 s2, s1 +; SICIVI-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; GFX9-DAG: s_mov_b32 s6, s1 +; GFX9-DAG: s_mov_b32 s7, 0 +; GFX9-DAG: s_mov_b32 s1, s7 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[6:7], 0x10 define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, <2 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(6)* %p1, i32 2 %r0 = load <2 x float>, <2 x float> addrspace(6)* %p0 @@ -126,8 +144,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 +; VI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 +; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 define amdgpu_vs <4 x float> @load_v4float(<4 x float> addrspace(6)* inreg %p0, <4 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <4 x float>, <4 x float> addrspace(6)* %p1, i32 2 %r0 = load <4 x float>, <4 x float> addrspace(6)* %p0 @@ -142,8 +162,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +; VI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 define amdgpu_vs <8 x float> @load_v8float(<8 x float> addrspace(6)* inreg %p0, <8 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <8 x float>, <8 x float> addrspace(6)* %p1, i32 2 %r0 = load <8 x float>, <8 x float> addrspace(6)* %p0 @@ -158,8 +180,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +; VI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 define amdgpu_vs <16 x float> @load_v16float(<16 x float> addrspace(6)* inreg %p0, <16 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <16 x float>, <16 x float> addrspace(6)* %p1, i32 2 %r0 = load <16 x float>, <16 x float> addrspace(6)* %p0 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -20,16 +20,16 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s0, s2 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 +; SI-NEXT: s_flbit_i32_b32 s5, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctlz_i32: @@ -376,23 +376,23 @@ define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s0, s2 -; SI-NEXT: s_flbit_i32_b32 s1, s3 -; SI-NEXT: s_add_i32 s0, s0, 32 -; SI-NEXT: s_or_b32 s2, s2, s3 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; SI-NEXT: s_flbit_i32_b32 s6, s4 +; SI-NEXT: s_flbit_i32_b32 s7, s5 +; SI-NEXT: s_add_i32 s6, s6, 32 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 +; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctlz_i64: @@ -440,22 +440,22 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s0, s2 -; SI-NEXT: s_flbit_i32_b32 s1, s3 -; SI-NEXT: s_add_i32 s0, s0, 32 -; SI-NEXT: s_or_b32 s2, s2, s3 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; SI-NEXT: s_flbit_i32_b32 s6, s4 +; SI-NEXT: s_flbit_i32_b32 s7, s5 +; SI-NEXT: s_add_i32 s6, s6, 32 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 +; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctlz_i64_trunc: diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll --- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll +++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll @@ -17,8 +17,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 < %s | FileCheck --check-prefixes=GFX704 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck --check-prefixes=GFX704 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx705 < %s | FileCheck --check-prefixes=GFX705 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck --check-prefixes=GFX801 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo < %s | FileCheck --check-prefixes=GFX801 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx801 < %s | FileCheck --check-prefixes=GFX801 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=carrizo < %s | FileCheck --check-prefixes=GFX801 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 < %s | FileCheck --check-prefixes=GFX802 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland < %s | FileCheck --check-prefixes=GFX802 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck --check-prefixes=GFX802 %s @@ -38,11 +38,14 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck --check-prefixes=XNACK-GFX900 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-xnack < %s | FileCheck --check-prefixes=NO-XNACK-GFX902 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+sram-ecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX904 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sram-ecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+sramecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX904 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+sram-ecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX904 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sram-ecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+sramecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX904 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s + +; FIXME: With the default attributes these directives are not accurate for +; xnack and sramecc. Subsequent Target-ID patches will address this. ; GFX600: .amdgcn_target "amdgcn-amd-amdhsa--gfx600" ; GFX601: .amdgcn_target "amdgcn-amd-amdhsa--gfx601" @@ -53,24 +56,24 @@ ; GFX703: .amdgcn_target "amdgcn-amd-amdhsa--gfx703" ; GFX704: .amdgcn_target "amdgcn-amd-amdhsa--gfx704" ; GFX705: .amdgcn_target "amdgcn-amd-amdhsa--gfx705" -; GFX801: .amdgcn_target "amdgcn-amd-amdhsa--gfx801+xnack" +; GFX801: .amdgcn_target "amdgcn-amd-amdhsa--gfx801" ; GFX802: .amdgcn_target "amdgcn-amd-amdhsa--gfx802" ; GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803" ; GFX805: .amdgcn_target "amdgcn-amd-amdhsa--gfx805" -; GFX810: .amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack" +; GFX810: .amdgcn_target "amdgcn-amd-amdhsa--gfx810" ; GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" -; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902+xnack" +; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902" ; GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904" ; GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906" ; XNACK-GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack" ; NO-XNACK-GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902" -; SRAM-ECC-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+sram-ecc" -; SRAM-ECC-GFX906: "amdgcn-amd-amdhsa--gfx906+sram-ecc" +; SRAM-ECC-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+sramecc" +; SRAM-ECC-GFX906: "amdgcn-amd-amdhsa--gfx906+sramecc" -; SRAM-ECC-XNACK-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack+sram-ecc" -; SRAM-ECC-XNACK-GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sram-ecc" +; SRAM-ECC-XNACK-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack+sramecc" +; SRAM-ECC-XNACK-GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sramecc" define amdgpu_kernel void @directive_amdgcn_target() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -321,11 +321,11 @@ ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_read_b32 v1, v1 offset:32 ; GFX9-NEXT: ds_read_b32 v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -374,11 +374,11 @@ ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_read_b32 v1, v1 offset:32 ; GFX9-NEXT: ds_read_b32 v2, v2 offset:32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -567,10 +567,10 @@ ; GFX9-ALIGNED-LABEL: unaligned_read2_f32: ; GFX9-ALIGNED: ; %bb.0: ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 ; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 ; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:1 ; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:2 @@ -596,10 +596,10 @@ ; GFX9-UNALIGNED-LABEL: unaligned_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 @@ -661,10 +661,10 @@ ; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-ALIGNED: ; %bb.0: ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 ; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 offset:5 ; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:6 ; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:7 @@ -690,10 +690,10 @@ ; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s0, v2, 5 +; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s4, v2, 5 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 @@ -745,10 +745,10 @@ ; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-ALIGNED: ; %bb.0: ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 ; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1 ; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2 ; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32 @@ -764,10 +764,10 @@ ; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 @@ -1210,10 +1210,10 @@ ; GFX9-LABEL: misaligned_read2_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -1241,10 +1241,10 @@ ; GFX9-LABEL: misaligned_read2_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -1288,16 +1288,16 @@ ; ; GFX9-LABEL: ds_read_diff_base_interleaving: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, s0, v1 -; GFX9-NEXT: v_add_u32_e32 v3, s1, v0 -; GFX9-NEXT: v_add_u32_e32 v4, s2, v1 -; GFX9-NEXT: v_add_u32_e32 v6, s3, v0 +; GFX9-NEXT: v_add_u32_e32 v2, s4, v1 +; GFX9-NEXT: v_add_u32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v4, s6, v1 +; GFX9-NEXT: v_add_u32_e32 v6, s7, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 ; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 @@ -1312,7 +1312,7 @@ ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: global_store_dword v8, v0, s[4:5] offset:40 +; GFX9-NEXT: global_store_dword v8, v0, s[2:3] offset:40 ; GFX9-NEXT: s_endpgm float addrspace(1)* nocapture %arg, [4 x [4 x float]] addrspace(3)* %arg1, @@ -1388,17 +1388,18 @@ ; GFX9-NEXT: s_getpc_b64 s[36:37] ; GFX9-NEXT: s_mov_b32 s36, s0 ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12 +; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s2 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-NEXT: ds_read_b32 v42, v41 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -197,14 +197,13 @@ ; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v5, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8 +; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8 ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i @@ -523,17 +522,17 @@ ; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: ds_write_b32 v2, v1 offset:32 +; GFX9-NEXT: ds_write_b32 v0, v1 offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ds_write_b32 v3, v0 offset:32 +; GFX9-NEXT: ds_write_b32 v3, v2 offset:32 ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i @@ -612,11 +611,11 @@ ; GFX9-LABEL: misaligned_simple_write2_one_val_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-NEXT: v_add_u32_e32 v2, s0, v2 +; GFX9-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:14 offset1:15 @@ -669,11 +668,11 @@ ; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX9-ALIGNED: ; %bb.0: ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s0, v2 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7 @@ -696,11 +695,11 @@ ; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v3, 5, v2 ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, 9, v2 ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -969,10 +968,10 @@ ; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-ALIGNED: ; %bb.0: ; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 -; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1 @@ -985,10 +984,10 @@ ; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v4, v0, 4, s4 -; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -61,6 +61,9 @@ ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1032 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1032 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1033 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1033 %s +; FIXME: With the default attributes the eflags are not accurate for +; xnack and sramecc. Subsequent Target-ID patches will address this. + ; ARCH-R600: Format: elf32-amdgpu ; ARCH-R600: Arch: r600 ; ARCH-R600: AddressSize: 32bit @@ -96,19 +99,15 @@ ; GFX704: EF_AMDGPU_MACH_AMDGCN_GFX704 (0x26) ; GFX705: EF_AMDGPU_MACH_AMDGCN_GFX705 (0x3B) ; GFX801: EF_AMDGPU_MACH_AMDGCN_GFX801 (0x28) -; GFX801-NEXT: EF_AMDGPU_XNACK (0x100) ; GFX802: EF_AMDGPU_MACH_AMDGCN_GFX802 (0x29) ; GFX803: EF_AMDGPU_MACH_AMDGCN_GFX803 (0x2A) ; GFX805: EF_AMDGPU_MACH_AMDGCN_GFX805 (0x3C) ; GFX810: EF_AMDGPU_MACH_AMDGCN_GFX810 (0x2B) -; GFX810-NEXT: EF_AMDGPU_XNACK (0x100) ; GFX900: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) ; GFX902: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) -; GFX902-NEXT: EF_AMDGPU_XNACK (0x100) ; GFX904: EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E) ; GFX906: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) ; GFX908: EF_AMDGPU_MACH_AMDGCN_GFX908 (0x30) -; GFX908-NEXT: EF_AMDGPU_SRAM_ECC (0x200) ; GFX909: EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31) ; GFX90C: EF_AMDGPU_MACH_AMDGCN_GFX90C (0x32) ; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33) diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll @@ -1,24 +1,9 @@ -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=-sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=+sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX902 %s - ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=-sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc,+xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s - -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 < %s | llvm-readobj -file-headers - | FileCheck --check-prefix=SRAM-ECC-GFX908 %s - -; NO-SRAM-ECC-GFX902: Flags [ -; NO-SRAM-ECC-GFX902-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) -; NO-SRAM-ECC-GFX902-NEXT: EF_AMDGPU_XNACK (0x100) -; NO-SRAM-ECC-GFX902-NEXT: ] +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=-sramecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sramecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sramecc,+xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s -; SRAM-ECC-GFX902: Flags [ -; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) -; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_SRAM_ECC (0x200) -; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_XNACK (0x100) -; SRAM-ECC-GFX902-NEXT: ] +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 -mattr=+sramecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefix=SRAM-ECC-GFX908 %s ; NO-SRAM-ECC-GFX906: Flags [ ; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -33,9 +33,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -32,18 +32,18 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen offset:2 +; GFX9-NEXT: buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2: ; GFX9-FLASTSCR: ; %bb.0: ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_ushort v1, v0, off -; GFX9-FLASTSCR-NEXT: scratch_load_ushort v0, v0, off offset:2 +; GFX9-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off offset:2 ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -1,11 +1,11 @@ ; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo --amdhsa-code-object-version=2 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -228,8 +228,8 @@ ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] ; GFX9-PAL-NEXT: s_mov_b32 s4, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 @@ -807,8 +807,8 @@ ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] ; GFX9-PAL-NEXT: s_mov_b32 s4, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 @@ -1414,8 +1414,8 @@ ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] ; GFX9-PAL-NEXT: s_mov_b32 s4, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 @@ -1927,8 +1927,8 @@ ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] ; GFX9-PAL-NEXT: s_mov_b32 s4, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -7,26 +7,26 @@ ; SI-LABEL: frem_f16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 @@ -42,7 +42,7 @@ ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_f16: @@ -126,22 +126,22 @@ ; SI-LABEL: fast_frem_f16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_rcp_f32_e32 v2, v1 @@ -149,7 +149,7 @@ ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: fast_frem_f16: @@ -216,22 +216,22 @@ ; SI-LABEL: unsafe_frem_f16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_rcp_f32_e32 v2, v1 @@ -239,7 +239,7 @@ ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f16: @@ -306,23 +306,23 @@ ; SI-LABEL: frem_f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 @@ -336,7 +336,7 @@ ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_f32: @@ -421,26 +421,26 @@ ; SI-LABEL: fast_frem_f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: fast_frem_f32: @@ -503,26 +503,26 @@ ; SI-LABEL: unsafe_frem_f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f32: @@ -585,20 +585,20 @@ ; SI-LABEL: frem_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s8 ; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -712,20 +712,20 @@ ; SI-LABEL: fast_frem_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s8 ; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -839,27 +839,27 @@ ; SI-LABEL: unsafe_frem_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] ; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 ; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s14 +; SI-NEXT: s_mov_b32 s0, s10 ; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 ; SI-NEXT: v_not_b32_e32 v6, v6 ; SI-NEXT: v_and_b32_e32 v6, v4, v6 @@ -873,7 +873,7 @@ ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f64: @@ -1813,20 +1813,20 @@ ; SI-LABEL: frem_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s8 ; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 offset:64 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] ; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -47,15 +47,15 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s4, 1 +; GFX9-NEXT: s_not_b32 s1, s6 +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -106,11 +106,11 @@ ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 25 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 25 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -185,21 +185,21 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s0, s5, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_not_b32 s1, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 +; GFX9-NEXT: s_not_b32 s1, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s5, s5, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 +; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_not_b32 s1, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v3 +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -261,11 +261,11 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -365,37 +365,37 @@ ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s0, s7, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: s_not_b32 s3, s3 +; GFX9-NEXT: s_not_b32 s1, s15 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s7, s7, 1 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1 +; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_not_b32 s1, s14 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; GFX9-NEXT: s_lshr_b32 s3, s6, 1 -; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 1 +; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: s_not_b32 s1, s1 +; GFX9-NEXT: s_not_b32 s1, s13 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s5, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 +; GFX9-NEXT: s_lshr_b32 s0, s5, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_not_b32 s1, s12 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v5 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32: @@ -470,20 +470,20 @@ ; ; GFX9-LABEL: fshl_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 23 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32_imm: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -50,11 +50,11 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm @@ -103,11 +103,11 @@ ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 7 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 7 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -170,14 +170,14 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -236,11 +236,11 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -316,25 +316,25 @@ ; ; GFX9-LABEL: fshr_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v4i32: @@ -401,20 +401,20 @@ ; ; GFX9-LABEL: fshr_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v4i32_imm: diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -389,6 +389,7 @@ ; FIXME: Should not scalarize ; GCN-LABEL: {{^}}v5i16_func_void: ; GFX9: buffer_load_dwordx2 v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v2 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -2508,13 +2508,13 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v4, s[4:5] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[4:5] offset:16 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 @@ -2540,7 +2540,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -2549,8 +2549,8 @@ ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v4, s[4:5] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[4:5] offset:16 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 @@ -2654,15 +2654,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v12, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v12, s[4:5] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v12, s[4:5] offset:32 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v12, s[4:5] offset:48 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 @@ -2688,7 +2688,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -2697,10 +2697,10 @@ ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v12, s[4:5] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v12, s[4:5] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v12, s[4:5] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v12, s[4:5] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 @@ -2730,19 +2730,19 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v28, 0 +; GFX9-NEXT: v_mov_b32_e32 v32, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 -; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 -; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 -; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 -; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GFX9-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; GFX9-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; GFX9-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 @@ -2768,7 +2768,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v28, 0 +; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -2777,14 +2777,14 @@ ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 @@ -2814,28 +2814,28 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v28, 0 +; GFX9-NEXT: v_mov_b32_e32 v32, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 -; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 -; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 -; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 -; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GFX9-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; GFX9-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; GFX9-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: global_load_dword v33, v[0:1], off ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: global_load_dword v32, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2856,30 +2856,30 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v28, 0 +; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: global_load_dword v32, v[0:1], off +; GFX10-NEXT: global_load_dword v33, v[0:1], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 ; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -2921,6 +2921,7 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: global_store_dword v[40:41], v0, off ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s4, v42, 0 ; GFX9-NEXT: v_readlane_b32 s5, v42, 1 @@ -2983,13 +2984,13 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:4 -; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5] +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:4 +; GFX9-NEXT: global_load_ubyte v0, v2, s[4:5] ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 @@ -3015,7 +3016,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -3024,8 +3025,8 @@ ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ubyte v0, v1, s[4:5] -; GFX10-NEXT: global_load_dword v1, v1, s[4:5] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v2, s[4:5] +; GFX10-NEXT: global_load_dword v1, v2, s[4:5] offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 @@ -5722,7 +5723,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 18 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: ; implicit-def: $vcc_hi @@ -5744,8 +5745,8 @@ ; GFX10-NEXT: v_writelane_b32 v40, s51, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 16 ; GFX10-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-NEXT: s_getpc_b64 s[30:31] @@ -5904,7 +5905,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 18 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s22, s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: ; implicit-def: $vcc_hi @@ -5912,6 +5915,8 @@ ; GFX10-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-NEXT: v_writelane_b32 v40, s38, 2 ; GFX10-NEXT: v_writelane_b32 v40, s39, 3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s22 ; GFX10-NEXT: v_writelane_b32 v40, s40, 4 ; GFX10-NEXT: v_writelane_b32 v40, s41, 5 ; GFX10-NEXT: v_writelane_b32 v40, s42, 6 @@ -5924,24 +5929,21 @@ ; GFX10-NEXT: v_writelane_b32 v40, s49, 13 ; GFX10-NEXT: v_writelane_b32 v40, s50, 14 ; GFX10-NEXT: v_writelane_b32 v40, s51, 15 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s20, s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX10-NEXT: v_writelane_b32 v40, s30, 16 ; GFX10-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-NEXT: s_getpc_b64 s[30:31] ; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-NEXT: v_mov_b32_e32 v3, s49 ; GFX10-NEXT: s_mov_b32 s20, s36 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 -; GFX10-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-NEXT: s_mov_b32 s21, s37 ; GFX10-NEXT: s_mov_b32 s22, s38 ; GFX10-NEXT: s_mov_b32 s23, s39 diff --git a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir @@ -1,6 +1,6 @@ -# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GCX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GCX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s # GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr_hidden_bundle # GCN: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir @@ -1,6 +1,6 @@ -# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s # GCN-LABEL: name: break_smem_clause_max_look_ahead_in_bundle # GCN: S_LOAD_DWORDX2_IMM diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -1,7 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=WAVE64 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=WAVE64 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=WAVE64 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX1010 --check-prefix=WAVE32 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=WAVE64 --check-prefix=NOTES %s +; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=WAVE64 --check-prefix=NOTES %s +; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=WAVE64 --check-prefix=NOTES %s +; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX1010 --check-prefix=WAVE32 --check-prefix=NOTES %s @var = addrspace(1) global float 0.0 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=2 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=2 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=2 -mattr=-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -mattr=-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s @var = addrspace(1) global float 0.0 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -5,8 +5,8 @@ ; GFX9-LABEL: udiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_sub_i32 s3, 0, s2 @@ -64,8 +64,8 @@ ; GFX9-LABEL: urem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_sub_i32 s3, 0, s2 @@ -121,6 +121,7 @@ ; GFX9-LABEL: sdiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s3, 31 @@ -180,6 +181,7 @@ ; GFX9-LABEL: srem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s3, s2, 31 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -62,57 +62,57 @@ ; GFX9-NODL-LABEL: udot2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -195,70 +195,70 @@ ; GFX9-NODL-LABEL: udot2_MulMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v1, s2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NODL-NEXT: v_add_u32_e32 v1, s5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v1, s5, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_add_u32_e32 v1, s9, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MulMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v1, s2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: v_add_u32_e32 v1, s5, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v1, s5, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, s9, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MulMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 -; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s5, s6 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s4, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s4, s5 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s8, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -334,56 +334,56 @@ ; GFX9-NODL-LABEL: idot2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot2_i32_i16 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot2_i32_i16 v1, s1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_i32_i16 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -460,67 +460,67 @@ ; GFX9-NODL-LABEL: idot2_MixedTypedMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedTypedMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedTypedMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -600,57 +600,57 @@ ; GFX9-NODL-LABEL: udot2_alt_AddOperands: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_alt_AddOperands: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -727,67 +727,67 @@ ; GFX9-NODL-LABEL: idot2_MixedExt: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s6, s3, 0xffff -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedExt: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s6, s3, 0xffff -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedExt: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -856,56 +856,56 @@ ; GFX9-NODL-LABEL: notudot2_SameVec: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, s2, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, s4, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, s1, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_SameVec: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, s2, v1 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, s4, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, s1, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, s0, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_SameVec: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, s3 -; GFX10-DL-NEXT: s_and_b32 s2, s4, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s1, s8 +; GFX10-DL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -985,57 +985,57 @@ ; GFX9-NODL-LABEL: udot2_v4i16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1115,57 +1115,57 @@ ; GFX9-NODL-LABEL: udot2_v4i16_Hi: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x4 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x4 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16_Hi: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x4 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1245,70 +1245,70 @@ ; GFX9-NODL-LABEL: notudot2_v4i16_Even: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NODL-NEXT: s_and_b32 s2, s2, s8 -; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, s10 +; GFX9-NODL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 -; GFX9-NODL-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Even: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-DL-NEXT: s_and_b32 s2, s2, s8 -; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-DL-NEXT: s_and_b32 s0, s0, s10 +; GFX9-DL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 -; GFX9-DL-NEXT: s_and_b32 s4, s4, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: s_and_b32 s4, s8, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Even: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s10, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s1, s1, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 -; GFX10-DL-NEXT: s_and_b32 s0, s0, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0 -; GFX10-DL-NEXT: s_and_b32 s1, s2, s7 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s10 +; GFX10-DL-NEXT: s_and_b32 s1, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s5, s9, s4 +; GFX10-DL-NEXT: s_and_b32 s0, s0, s4 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s1, v0 +; GFX10-DL-NEXT: s_and_b32 s1, s8, s4 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1388,70 +1388,70 @@ ; GFX9-NODL-LABEL: notudot2_v4i16_Middle: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Middle: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Middle: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s10, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s1, s1, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s10 +; GFX10-DL-NEXT: s_and_b32 s1, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s9, s4 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0 -; GFX10-DL-NEXT: s_lshr_b32 s1, s2, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s1, v0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s8, 16 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1531,70 +1531,70 @@ ; GFX9-NODL-LABEL: notudot2_DiffIndex: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_DiffIndex: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 -; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_DiffIndex: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-DL-NEXT: s_and_b32 s6, s1, s2 -; GFX10-DL-NEXT: s_and_b32 s0, s0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX10-DL-NEXT: s_and_b32 s6, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s0, s0, s4 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1676,73 +1676,73 @@ ; GFX9-NODL-LABEL: udot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v2, v1 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v2, v1 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v2, v1 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v2, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: s_and_b32 s0, s0, s6 ; GFX10-DL-NEXT: s_and_b32 s1, s1, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1823,70 +1823,70 @@ ; GFX9-NODL-LABEL: idot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v1 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v2, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s1, s0, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1970,73 +1970,73 @@ ; GFX9-NODL-LABEL: udot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2118,70 +2118,70 @@ ; GFX9-NODL-LABEL: idot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v3, v1 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s1, v3, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v3, v1 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s1, v3, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-DL-NEXT: s_sext_i32_i16 s3, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i16 s5, s1 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 16 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2266,73 +2266,73 @@ ; GFX9-NODL-LABEL: udot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v1, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: s_and_b32 s0, s0, s6 ; GFX10-DL-NEXT: s_and_b32 s1, s1, s6 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2414,70 +2414,70 @@ ; GFX9-NODL-LABEL: idot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2559,40 +2559,40 @@ ; GFX9-NODL-LABEL: udot2_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s8, 16 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_acc16: @@ -2603,10 +2603,10 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot2_u32_u16 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_dot2_u32_u16 v1, s6, s7, v1 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2692,12 +2692,12 @@ ; GFX9-NODL-LABEL: notsdot2_sext8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 @@ -2707,20 +2707,20 @@ ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v4, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notsdot2_sext8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 @@ -2730,9 +2730,9 @@ ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, v4, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notsdot2_sext8: diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -72,64 +72,64 @@ ; GFX9-NODL-LABEL: idot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v1, v2 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s8, v2, v1 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s0, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot4_i32_i8 v0, s0, s1, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -247,47 +247,47 @@ ; GFX9-NODL-LABEL: idot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s0, 0x80010 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v2, v1 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v3, v1 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s8, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16: @@ -298,10 +298,10 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_i32_i8 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_dot4_i32_i8 v1, s6, s7, v1 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -411,48 +411,48 @@ ; GFX9-NODL-LABEL: idot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s4, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s8, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s8, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s8, s8, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v3, v1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc8: @@ -463,10 +463,10 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s6, s7, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -568,92 +568,92 @@ ; GFX9-NODL-LABEL: idot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v1, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v1, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s8, v2, v1 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-DL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v2, v1 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_sext_i32_i8 s2, s0 -; GFX10-DL-NEXT: s_sext_i32_i8 s3, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80010 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x80010 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s0, s1, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -764,93 +764,93 @@ ; GFX9-NODL-LABEL: idot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s6, s3, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s4, s2, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s6, s1, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s4, s0, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s0, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s2, v3, v4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s0, v3, v4 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v1, v2, v3 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-DL-NEXT: s_ashr_i32 s6, s3, 24 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-DL-NEXT: s_ashr_i32 s4, s2, 24 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-DL-NEXT: s_ashr_i32 s6, s1, 24 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s1, s1 +; GFX9-DL-NEXT: s_ashr_i32 s4, s0, 24 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s2, v3, v4 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, s0, v3, v4 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s5, s3 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x80010 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s0, s1, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -944,15 +944,15 @@ ; GFX9-NODL-LABEL: idot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v4, 8, s5 ; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v3, 8, s4 @@ -961,37 +961,37 @@ ; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, s4, v5 ; GFX9-NODL-NEXT: v_lshl_or_b32 v3, v3, 16, v6 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s2 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s0 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX9-NODL-NEXT: v_and_b32_e32 v4, s3, v5 -; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, s1, v5 +; GFX9-NODL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 -; GFX9-NODL-NEXT: v_and_b32_e32 v4, s2, v5 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v5 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v4 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v2 ; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s5 ; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s4 @@ -1000,23 +1000,23 @@ ; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 ; GFX9-DL-NEXT: v_and_b32_e32 v6, s4, v5 ; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s3 -; GFX9-DL-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s2 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s1 +; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s0 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX9-DL-NEXT: v_and_b32_e32 v4, s3, v5 -; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s1, v5 +; GFX9-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v5 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v5 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: @@ -1028,28 +1028,28 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s0 -; GFX10-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s1 -; GFX10-DL-NEXT: v_and_b32_e32 v5, s3, v2 -; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s2 +; GFX10-DL-NEXT: s_bfe_i32 s1, s6, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s2, s7, 0x80000 +; GFX10-DL-NEXT: v_and_b32_e32 v6, s1, v2 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s6 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s7 +; GFX10-DL-NEXT: v_and_b32_e32 v5, s2, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 16 ; GFX10-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6 -; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x80000 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s0 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x80000 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s0 +; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX10-DL-NEXT: v_and_b32_e32 v6, s2, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v6 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s1 ; GFX10-DL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v6 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -74,65 +74,65 @@ ; GFX9-NODL-LABEL: udot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s0, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, s0, s1, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -242,48 +242,48 @@ ; GFX9-NODL-LABEL: udot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_and_b32 s4, s1, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s8, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s8, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s8, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s8, s8, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v3, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v4, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16: @@ -294,10 +294,10 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s6, s7, v1 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -408,48 +408,48 @@ ; GFX9-NODL-LABEL: udot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s4, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s8, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s8, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s8, s8, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v3, v1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8: @@ -460,10 +460,10 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s6, s7, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -551,47 +551,47 @@ ; GFX9-NODL-LABEL: udot2_8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s4, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s3, s3, 0x80008 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_bfe_u32 s5, s8, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s5, s4, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s3, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x80008 -; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x80008 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-DL-NEXT: s_and_b32 s4, s8, s0 +; GFX9-DL-NEXT: s_and_b32 s0, s1, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: s_bfe_u32 s5, s8, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_8: @@ -602,17 +602,17 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s1, 0xff +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s0, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s2, s1 -; GFX10-DL-NEXT: s_and_b32 s1, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s1, s7, s0 +; GFX10-DL-NEXT: s_and_b32 s0, s6, s0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x80008 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s7, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -704,48 +704,48 @@ ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_and_b32 s4, s1, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s0, s8, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s8, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s8, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NODL-NEXT: s_lshr_b32 s8, s8, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v3, v1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s3, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s1, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationInsideMAD: @@ -756,10 +756,10 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s1, s0, v1 +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s7, s6, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -863,63 +863,63 @@ ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s4, s1, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s8, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s0, s8, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s8, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NODL-NEXT: s_lshr_b32 s8, s8, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v3, v1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-DL-NEXT: s_and_b32 s4, s1, s0 +; GFX9-DL-NEXT: s_bfe_u32 s6, s8, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-DL-NEXT: s_and_b32 s0, s8, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: s_bfe_u32 s9, s8, 0x80010 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-DL-NEXT: s_lshr_b32 s8, s8, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v3, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v3, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: @@ -927,25 +927,25 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_movk_i32 s6, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1 -; GFX10-DL-NEXT: s_and_b32 s2, s0, s6 -; GFX10-DL-NEXT: s_and_b32 s3, s1, s6 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 +; GFX10-DL-NEXT: s_and_b32 s0, s6, s2 +; GFX10-DL-NEXT: s_and_b32 s1, s7, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm @@ -1051,95 +1051,95 @@ ; GFX9-NODL-LABEL: udot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v2 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v2 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x80010 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s4, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s1, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1254,98 +1254,98 @@ ; GFX9-NODL-LABEL: udot4_multiuse_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v1, v2 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, s10, v1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v3, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v3, v1 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v3, v1 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v3, v1 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v1, v2 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-DL-NEXT: v_add_u32_e32 v2, s10, v1 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v3, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v3, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v3, v1 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v3, v1 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v3, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_movk_i32 s7, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s6, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 -; GFX10-DL-NEXT: s_and_b32 s2, s0, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s1, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v0 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: s_and_b32 s4, s0, s6 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s5, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s5, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s8, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1457,61 +1457,61 @@ ; GFX9-NODL-LABEL: notdot4_mixedtypes: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s2, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v3, v1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v4, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notdot4_mixedtypes: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x80008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v3, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v4, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes: @@ -1522,21 +1522,21 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_sext_i32_i8 s2, s0 -; GFX10-DL-NEXT: s_sext_i32_i8 s3, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_sext_i32_i8 s0, s6 +; GFX10-DL-NEXT: s_sext_i32_i8 s1, s7 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm @@ -1650,95 +1650,95 @@ ; GFX9-NODL-LABEL: udot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 24 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s4, 24 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s4 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, s3, v3, v4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, s0, v3, v4 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v2, v3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 24 -; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 24 -; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX9-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s4 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-DL-NEXT: s_and_b32 s0, s0, s8 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s3, v3, v4 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, s0, v3, v4 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s6, 0xff -; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: s_and_b32 s4, s2, s6 -; GFX10-DL-NEXT: s_and_b32 s6, s3, s6 -; GFX10-DL-NEXT: v_and_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_and_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-DL-NEXT: s_and_b32 s6, s0, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s5 +; GFX10-DL-NEXT: v_and_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s6, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s1, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1833,71 +1833,71 @@ ; GFX9-NODL-LABEL: udot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-NODL-NEXT: v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v4, s6, 16, v4 ; GFX9-NODL-NEXT: v_lshl_or_b32 v5, s4, 16, v5 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX9-NODL-NEXT: v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v5, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v2 ; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-DL-NEXT: v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 24 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-DL-NEXT: v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, s6, 16, v4 ; GFX9-DL-NEXT: v_lshl_or_b32 v5, s4, 16, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX9-DL-NEXT: v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-DL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_and_b32_sdwa v5, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-DL-NEXT: v_and_b32_sdwa v3, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: @@ -1909,24 +1909,24 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s6 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s7 +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s0, s7, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s6, 16 ; GFX10-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s6, 24 +; GFX10-DL-NEXT: s_lshr_b32 s0, s7, 24 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, s1, 16, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, s0, 16, v2 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, s0, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, s1, 16, v2 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v3, v1 @@ -2050,23 +2050,23 @@ ; GFX9-NODL-LABEL: udot4_acc8_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: global_load_ubyte v4, v0, s[0:1] +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v4, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s2, v1 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s0, v1 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v2, s0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s4, v3 @@ -2079,29 +2079,29 @@ ; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ubyte v4, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v4, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s2, v1 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s0, v1 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, s0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s4, v3 @@ -2114,7 +2114,7 @@ ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: @@ -2125,18 +2125,18 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s1 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 24 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s2, s3 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s6 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s7 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 24 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, v2, v3 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s0, s1 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s6, s7 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 16 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v2 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v4 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -3,10 +3,10 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc32: @@ -118,77 +118,122 @@ ; GFX9-LABEL: idot8_acc32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 -; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mad_i32_i24 v1, s4, v1, v2 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX9-NEXT: v_mad_i32_i24 v1, s8, v2, v1 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v2, s11 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_mad_i32_i24 v1, s10, v2, v1 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v1, s12, v2, v1 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v1, s14, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: v_mad_i32_i24 v1, s16, v2, v1 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s0, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_acc32: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v0, s0, s1, v0 +; GFX10-DL-XNACK-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc32: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, s0, s1, v0 +; GFX10-DL-NOXNACK-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 @@ -411,47 +456,47 @@ ; ; GFX9-LABEL: idot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40000 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s11, s3, 0x40008 +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: s_lshr_b32 s4, s2, 12 -; GFX9-NEXT: s_lshr_b32 s5, s3, 12 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-NEXT: s_lshr_b32 s4, s0, 12 +; GFX9-NEXT: s_lshr_b32 s5, s1, 12 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mov_b32_e32 v6, s9 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s5 ; GFX9-NEXT: v_mul_i32_i24_e32 v2, s10, v2 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v8, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_i32_i24 v1, s6, v5, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s8, v6, v1 @@ -460,54 +505,54 @@ ; GFX9-NEXT: v_mad_i32_i24 v1, s12, v7, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s14, v8, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s16, v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x40008 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 12 -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 12 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 12 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s9 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, s10, v2 -; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v5, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v6, v1 @@ -516,11 +561,116 @@ ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s12, v7, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s14, v8, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s16, v9, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s0, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_acc16: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX10-DL-XNACK-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s0, s6, 12 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s1, s7, 12 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s2, s6, 0x40000 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s3, s7, 0x40000 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v2, 12, s0 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s8, s6, 0x40004 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s9, s6, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s10, s7, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s7, 0x40004 +; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e64 v4, s9, s10 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s1, s7, 0x40010 +; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s8, s0, v1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s0, 0xffff +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s6, 0x40010 +; GFX10-DL-XNACK-NEXT: v_mad_u32_u24 v1, v2, v3, v1 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s6, 0x40014 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s1, s7, 0x40014 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s6, 0x40018 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s1, s7, 0x40018 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s0, s6, 28 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s1, s7, 28 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc16: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s2, s0, 12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s3, s1, 12 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v2, 12, s2 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s3 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e64 v4, s9, s10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s8, s2, v1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NOXNACK-NEXT: v_mad_u32_u24 v1, v2, v3, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40014 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40014 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-NOXNACK-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 @@ -775,122 +925,227 @@ ; ; GFX9-LABEL: idot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_movk_i32 s2, 0xff +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s3, 12 -; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40000 -; GFX9-NEXT: s_lshr_b32 s6, s4, 12 -; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40004 -; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40008 -; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s5 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s6 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s11, s3, 0x40008 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s8, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-NEXT: s_bfe_i32 s7, s8, 0x40000 +; GFX9-NEXT: s_lshr_b32 s5, s8, 12 +; GFX9-NEXT: s_bfe_i32 s10, s8, 0x40004 +; GFX9-NEXT: s_bfe_i32 s12, s8, 0x40008 +; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s5 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_mov_b32_e32 v6, s10 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_mul_i32_i24_e32 v2, s11, v2 -; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_i32 s14, s8, 0x40010 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-NEXT: s_bfe_i32 s16, s8, 0x40014 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-NEXT: s_bfe_i32 s18, s8, 0x40018 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 -; GFX9-NEXT: s_ashr_i32 s4, s4, 28 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX9-NEXT: s_ashr_i32 s8, s8, 28 ; GFX9-NEXT: v_mov_b32_e32 v9, s18 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v1, s7, v5, v1 +; GFX9-NEXT: v_mad_i32_i24 v1, s6, v5, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s9, v6, v1 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mad_u32_u24 v1, v3, v4, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s13, v7, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s15, v8, v1 ; GFX9-NEXT: v_mad_i32_i24 v1, s17, v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 12 -; GFX9-DL-NEXT: s_bfe_i32 s8, s4, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 12 -; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s5 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s6 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s7, s8, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s5, s8, 12 +; GFX9-DL-NEXT: s_bfe_i32 s10, s8, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s12, s8, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s10 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, s11, v2 -; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-DL-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s14, s8, 0x40010 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-DL-NEXT: s_bfe_i32 s16, s8, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-DL-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s18, s8, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 +; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s8, s8, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s18 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s7, v5, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v5, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s9, v6, v1 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v3, v4, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s13, v7, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s15, v8, v1 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s17, v9, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s1, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_acc8: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: global_load_ubyte v1, v0, s[4:5] +; GFX10-DL-XNACK-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s0, s6, 12 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s1, s7, 12 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s2, s6, 0x40000 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s3, s7, 0x40000 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v2, 12, s0 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s8, s6, 0x40004 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s9, s6, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s10, s7, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s7, 0x40004 +; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e64 v4, s9, s10 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s1, s7, 0x40010 +; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s8, s0, v1 +; GFX10-DL-XNACK-NEXT: s_movk_i32 s0, 0xff +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s6, 0x40010 +; GFX10-DL-XNACK-NEXT: v_mad_u32_u24 v1, v2, v3, v1 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s6, 0x40014 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s1, s7, 0x40014 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s6, 0x40018 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s1, s7, 0x40018 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s0, s6, 28 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s1, s7, 28 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc8: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s2, s0, 12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s3, s1, 12 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v2, 12, s2 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s3 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e64 v4, s9, s10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s8, s2, v1 +; GFX10-DL-NOXNACK-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NOXNACK-NEXT: v_mad_u32_u24 v1, v2, v3, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40014 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40014 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-NOXNACK-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 @@ -1127,111 +1382,206 @@ ; GFX9-LABEL: idot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mad_i32_i24 v2, s4, v1, v2 -; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-NEXT: v_mad_i32_i24 v1, s4, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mad_i32_i24 v1, s6, v3, v1 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX9-NEXT: v_mad_i32_i24 v1, s8, v3, v1 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_mad_i32_i24 v1, s10, v3, v1 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v1, s12, v3, v1 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v1, s14, v3, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: v_mad_i32_i24 v1, s16, v3, v1 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_mad_i32_i24 v1, s2, v3, v1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mad_i32_i24 v1, s0, v3, v1 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s18, s[2:3], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v2 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v3, v1 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v3, v1 -; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s10, v3, v1 -; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s12, v3, v1 -; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s14, v3, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s16, v3, v1 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v3, v1 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s0, v3, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-XNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v0 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40004 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40004 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40008 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x4000c +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x4000c +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40010 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40010 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40014 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40014 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40018 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40018 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40000 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v0, s2, s3, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v0 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40004 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40004 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40008 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x4000c +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x4000c +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40010 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40014 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40014 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 @@ -1460,38 +1810,38 @@ ; GFX9-LABEL: idot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 28 -; GFX9-NEXT: s_ashr_i32 s11, s3, 28 -; GFX9-NEXT: s_bfe_i32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x4000c -; GFX9-NEXT: s_bfe_i32 s16, s3, 0x40008 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s3, s3, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s2, s2, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s4, s0, 28 +; GFX9-NEXT: s_ashr_i32 s11, s1, 28 +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x4000c +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40008 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s1, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x4000c +; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mad_i32_i24 v1, s2, v1, v2 +; GFX9-NEXT: v_mad_i32_i24 v1, s0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-NEXT: v_mad_i32_i24 v1, s10, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 @@ -1506,31 +1856,76 @@ ; GFX9-NEXT: v_mad_i32_i24 v1, s5, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mad_i32_i24 v1, s4, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s0, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v0, s0, s1, v0 +; GFX10-DL-XNACK-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, s0, s1, v0 +; GFX10-DL-NOXNACK-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 @@ -1710,33 +2105,35 @@ ; ; GFX9-LABEL: idot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_and_b32 s11, s2, 15 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s8 -; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_and_b32 s11, s0, 15 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s9, s10 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s5, s8 +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s4 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 @@ -1745,25 +2142,24 @@ ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-NEXT: s_and_b32 s17, s6, 15 ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s17, s6 -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s17, s6 +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v5 -; GFX9-NEXT: global_load_ushort v5, v0, s[0:1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s15, s16 -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s13, s14 +; GFX9-NEXT: global_load_ushort v5, v0, s[2:3] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s15, s16 +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s13, s14 ; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s7, s12 ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 ; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] @@ -1777,38 +2173,40 @@ ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s8 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s11, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s10 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s8 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s1, s4 ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 @@ -1817,25 +2215,24 @@ ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s17, s6 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s17, s6 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 -; GFX9-DL-NEXT: global_load_ushort v5, v0, s[0:1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s15, s16 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s13, s14 +; GFX9-DL-NEXT: global_load_ushort v5, v0, s[2:3] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s15, s16 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s13, s14 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s12 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s7, s12 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] @@ -1849,9 +2246,154 @@ ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX10-DL-XNACK-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s1, s6, 28 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s2, s6, 0x40010 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s3, s6, 0x40014 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s8, s6, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s9, s6, 0x4000c +; GFX10-DL-XNACK-NEXT: s_and_b32 s10, s6, 15 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s6, s6, 0x40004 +; GFX10-DL-XNACK-NEXT: s_and_b32 s11, s7, 15 +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s6, s10, s6 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s10, s7, 0x40004 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, s6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s6, s11, s10 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s11, s7, 0x4000c +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v3, 12, s6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s6, s7, 0x40008 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s8, s7, 0x40010 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s6, s7, 0x40014 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s3, s8, s6 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v6, 12, s3 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX10-DL-XNACK-NEXT: s_bfe_u32 s10, s7, 0x40018 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s2, s7, 28 +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_pack_ll_b32_b16 s1, s10, s2 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v4 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s3 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s3, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s7, s0, 0x40014 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX10-DL-NOXNACK-NEXT: s_and_b32 s10, s0, 15 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX10-DL-NOXNACK-NEXT: s_and_b32 s11, s1, 15 +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s10, s0 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s10, s1, 0x40004 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s11, s10 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s11, s1, 0x4000c +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s0, s1, 0x40008 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s8, s1, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s0, s1, 0x40014 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s8, s0 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s0, s1, 28 +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s1, s2, s3 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s10, s0 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s1 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v4 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 @@ -2124,31 +2666,31 @@ ; ; GFX9-LABEL: idot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s9, s3, 4 -; GFX9-NEXT: s_lshr_b32 s16, s4, 4 -; GFX9-NEXT: v_lshlrev_b16_e64 v2, 12, s3 -; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s8, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s9, s1, 4 +; GFX9-NEXT: s_lshr_b32 s16, s8, 4 +; GFX9-NEXT: v_lshlrev_b16_e64 v2, 12, s1 +; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s8 ; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s9 ; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s16 -; GFX9-NEXT: s_lshr_b32 s10, s3, 12 -; GFX9-NEXT: s_lshr_b32 s11, s3, 8 -; GFX9-NEXT: s_lshr_b32 s17, s4, 12 -; GFX9-NEXT: s_lshr_b32 s18, s4, 8 +; GFX9-NEXT: s_lshr_b32 s10, s1, 12 +; GFX9-NEXT: s_lshr_b32 s11, s1, 8 +; GFX9-NEXT: s_lshr_b32 s17, s8, 12 +; GFX9-NEXT: s_lshr_b32 s18, s8, 8 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s11 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s10 ; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s18 @@ -2164,24 +2706,24 @@ ; GFX9-NEXT: v_mul_lo_u16_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s5, s3, 20 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 20 -; GFX9-NEXT: s_lshr_b32 s13, s4, 16 +; GFX9-NEXT: s_lshr_b32 s4, s1, 20 +; GFX9-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NEXT: s_lshr_b32 s12, s8, 20 +; GFX9-NEXT: s_lshr_b32 s13, s8, 16 ; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v4, v4, v11 -; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s5 +; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4 ; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s13 ; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s12 -; GFX9-NEXT: s_lshr_b32 s7, s3, 28 -; GFX9-NEXT: s_lshr_b32 s8, s3, 24 -; GFX9-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-NEXT: s_lshr_b32 s15, s4, 24 -; GFX9-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-NEXT: s_lshr_b32 s6, s1, 28 +; GFX9-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-NEXT: s_lshr_b32 s14, s8, 28 +; GFX9-NEXT: s_lshr_b32 s15, s8, 24 +; GFX9-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s8 -; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s7 +; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s6 ; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s15 ; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s14 ; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 @@ -2200,7 +2742,7 @@ ; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-NEXT: v_or_b32_e32 v5, v3, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 @@ -2212,36 +2754,36 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s9, s3, 4 -; GFX9-DL-NEXT: s_lshr_b32 s16, s4, 4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s3 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 4 +; GFX9-DL-NEXT: s_lshr_b32 s16, s8, 4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s1 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s8 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s16 -; GFX9-DL-NEXT: s_lshr_b32 s10, s3, 12 -; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 8 -; GFX9-DL-NEXT: s_lshr_b32 s17, s4, 12 -; GFX9-DL-NEXT: s_lshr_b32 s18, s4, 8 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 12 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 8 +; GFX9-DL-NEXT: s_lshr_b32 s17, s8, 12 +; GFX9-DL-NEXT: s_lshr_b32 s18, s8, 8 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s18 @@ -2257,24 +2799,24 @@ ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 20 -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s12, s4, 20 -; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 20 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s12, s8, 20 +; GFX9-DL-NEXT: s_lshr_b32 s13, s8, 16 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, v4, v11 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s12 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 28 -; GFX9-DL-NEXT: s_lshr_b32 s8, s3, 24 -; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 24 -; GFX9-DL-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 28 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-DL-NEXT: s_lshr_b32 s14, s8, 28 +; GFX9-DL-NEXT: s_lshr_b32 s15, s8, 24 +; GFX9-DL-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX9-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14 ; GFX9-DL-NEXT: v_or_b32_e32 v4, v2, v4 @@ -2293,7 +2835,7 @@ ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14 ; GFX9-DL-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v7 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 @@ -2305,9 +2847,204 @@ ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; +; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul: +; GFX10-DL-XNACK: ; %bb.0: ; %entry +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s22, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s23, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s20, s20, s3 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s21, s21, 0 +; GFX10-DL-XNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: global_load_ubyte v1, v0, s[4:5] +; GFX10-DL-XNACK-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s0, 0xffff +; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s9, s6, 4 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s16, s7, 4 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v6, 12, s9 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s10, s6, 12 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s17, s7, 12 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v2, 12, s6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v13, 12, s17 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v5, 12, s10 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v12, 12, v12 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s11, s6, 8 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s18, s7, 8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v4, 12, s11 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v11, 12, s18 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v19, 12, v5 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v6, v6, v12 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v13, 12, v13 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v11, 12, v11 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v2, v2, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v5, 8, v6 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v3, v19, v13 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s1, s6, 20 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s2, s6, 16 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s3, s6, 28 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s8, s6, 24 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s12, s7, 20 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v8, 12, s3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v9, 12, s2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v10, 12, s1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v12, 12, s12 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v4, v4, v11 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v3, 8, v3 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s13, s7, 16 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s14, s7, 28 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v6, 12, s13 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v5, 12, v7 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v8 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v8, 12, v9 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v15, 12, s14 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v4, 12, v10 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v9, 12, v12 +; GFX10-DL-XNACK-NEXT: s_lshr_b32 s15, s7, 24 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v10, 12, v15 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX10-DL-XNACK-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v4, v4, v9 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v15, v8, v6 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v7, v7, v10 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v11, 12, v14 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v2, 8, v4 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v4, v5, v11 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v5, 8, v7 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v1, v8 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-XNACK-NEXT: v_or_b32_e32 v3, v2, v4 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-XNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-DL-XNACK-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s22, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s23, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s20, s20, s3 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s21, s21, 0 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s9, s0, 4 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s16, s1, 4 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v6, 12, s9 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s10, s0, 12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s17, s1, 12 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v2, 12, s0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v13, 12, s17 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v5, 12, s10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v12, 12, v12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s11, s0, 8 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s18, s1, 8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v4, 12, s11 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v11, 12, s18 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v19, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v6, v6, v12 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v13, 12, v13 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v4, 12, v4 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v2, v2, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v5, 8, v6 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v3, v19, v13 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s3, s0, 20 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s7, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s12, s1, 20 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v8, 12, s7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v9, 12, s6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v10, 12, s3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v12, 12, s12 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v4, v4, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 8, v3 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s13, s1, 16 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s14, s1, 28 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v6, 12, s13 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v5, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v8, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v15, 12, s14 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v4, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v9, 12, v12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s15, s1, 24 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v10, 12, v15 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v4, v4, v9 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v15, v8, v6 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v7, v7, v10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v11, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v2, 8, v4 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v4, v5, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v5, 8, v7 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v1, v8 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_e32 v3, v2, v4 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NOXNACK-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -115,18 +115,20 @@ ; ; GFX9-LABEL: udot8_acc32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 @@ -136,17 +138,17 @@ ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 @@ -160,54 +162,53 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s0, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s0, s1, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -389,37 +390,37 @@ ; ; GFX9-LABEL: udot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40004 -; GFX9-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-NEXT: s_and_b32 s3, s3, 15 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-NEXT: s_and_b32 s1, s1, 15 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 ; GFX9-NEXT: v_mov_b32_e32 v5, s15 @@ -427,7 +428,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v2, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 @@ -437,42 +438,42 @@ ; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s17, s3, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-DL-NEXT: s_and_b32 s3, s3, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 @@ -480,7 +481,7 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 @@ -490,7 +491,7 @@ ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: @@ -507,34 +508,34 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40008 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm @@ -718,37 +719,37 @@ ; ; GFX9-LABEL: udot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40004 -; GFX9-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-NEXT: s_and_b32 s3, s3, 15 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-NEXT: s_and_b32 s1, s1, 15 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 ; GFX9-NEXT: v_mov_b32_e32 v5, s15 @@ -756,7 +757,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v2, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 @@ -766,42 +767,42 @@ ; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s17, s3, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-DL-NEXT: s_and_b32 s3, s3, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 @@ -809,7 +810,7 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v2, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 @@ -819,7 +820,7 @@ ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: @@ -836,34 +837,34 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40008 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm @@ -1051,41 +1052,41 @@ ; ; GFX9-LABEL: udot8_acc4: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s10, s2, 15 -; GFX9-NEXT: s_and_b32 s17, s3, 15 -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s10, s0, 15 +; GFX9-NEXT: s_and_b32 s17, s1, 15 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008 -; GFX9-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, s0, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-NEXT: v_mov_b32_e32 v7, s13 @@ -1102,46 +1103,46 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s10, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s17, s3, 15 -; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_and_b32 s10, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s17, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s0, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 @@ -1158,7 +1159,7 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4: @@ -1175,36 +1176,36 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s7, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s3, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x4000c +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s1, s3 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s2, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] @@ -1377,41 +1378,41 @@ ; ; GFX9-LABEL: udot8_CommutationInsideMAD: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s10, s2, 15 -; GFX9-NEXT: s_and_b32 s17, s3, 15 -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s10, s0, 15 +; GFX9-NEXT: s_and_b32 s17, s1, 15 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008 -; GFX9-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, s0, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-NEXT: v_mov_b32_e32 v7, s13 @@ -1428,46 +1429,46 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s10, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s17, s3, 15 -; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_and_b32 s10, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s17, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s0, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 @@ -1484,7 +1485,7 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: @@ -1501,36 +1502,36 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x4000c +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s0, s1 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s2, s3 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] @@ -1700,18 +1701,20 @@ ; ; GFX9-LABEL: udot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 @@ -1721,19 +1724,19 @@ ; GFX9-NEXT: s_bfe_u32 s15, s6, 0x4000c ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v1, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s11, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s16 ; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 @@ -1746,26 +1749,27 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: v_mad_u32_u24 v2, s4, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-DL-NEXT: s_lshr_b32 s7, s6, 28 @@ -1775,19 +1779,19 @@ ; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x4000c ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-DL-NEXT: s_and_b32 s6, s6, 15 -; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 +; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v1, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v3, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s16 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v3, v2 @@ -1800,40 +1804,39 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_and_b32 s4, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s5, s1, 15 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v0 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c @@ -1851,7 +1854,7 @@ ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2032,18 +2035,20 @@ ; ; GFX9-LABEL: udot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 @@ -2053,17 +2058,17 @@ ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 @@ -2077,54 +2082,53 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s0, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s0, s1, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2281,49 +2285,49 @@ ; ; GFX9-LABEL: udot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: global_load_ushort v5, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: global_load_ushort v5, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 ; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v1, s3, v1 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s13, s14 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, s1, v1 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s13, s14 ; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-NEXT: s_and_b32 s17, s6, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s15, s16 ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s17, s6 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_and_b32 s11, s2, 15 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s17, s6 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_and_b32 s11, s0, 15 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s8 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s2, v4 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, s0, v4 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, s4, v2 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s10 ; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v3 @@ -2336,54 +2340,54 @@ ; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-DL-NEXT: global_load_ushort v5, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ushort v5, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 ; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 ; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, s3, v1 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s13, s14 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, s1, v1 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s13, s14 ; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s15, s16 ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s17, s6 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s17, s6 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s11, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s5, s8 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s2, v4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s0, v4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s4, v2 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s10 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v3 @@ -2396,7 +2400,7 @@ ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: @@ -2413,42 +2417,42 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s7 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s2, s3 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: s_bfe_u32 s3, s6, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s7, 0x40004 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s7, 0x40008 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x4000c ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s6 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x40014 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s2, s7, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x40014 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s3, s1 +; GFX10-DL-NEXT: s_lshr_b32 s1, s6, 28 +; GFX10-DL-NEXT: s_lshr_b32 s3, s7, 28 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s2, s6 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s0, s2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s2, s7, 0x40018 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3 @@ -2647,61 +2651,61 @@ ; ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s5, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s15, s4, 28 -; GFX9-NEXT: s_and_b32 s16, s4, 15 -; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s12, s8, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, s8, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s8, 0x40018 +; GFX9-NEXT: s_lshr_b32 s15, s8, 28 +; GFX9-NEXT: s_and_b32 s16, s8, 15 +; GFX9-NEXT: s_bfe_u32 s17, s8, 0x40004 +; GFX9-NEXT: s_bfe_u32 s18, s8, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s6, s3, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s8, 0x4000c +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s1, 0x40018 ; GFX9-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-NEXT: s_lshr_b32 s8, s3, 28 +; GFX9-NEXT: s_lshr_b32 s7, s1, 28 ; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: s_and_b32 s9, s3, 15 +; GFX9-NEXT: s_and_b32 s9, s1, 15 ; GFX9-NEXT: v_mov_b32_e32 v6, s16 -; GFX9-NEXT: s_bfe_u32 s10, s3, 0x40004 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v7, s17 -; GFX9-NEXT: s_bfe_u32 s11, s3, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v8, s18 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v9, s4 -; GFX9-NEXT: v_mul_lo_u16_e32 v2, s5, v2 -; GFX9-NEXT: v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v4, s7, v4 -; GFX9-NEXT: v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-NEXT: v_mul_lo_u16_e32 v2, s4, v2 +; GFX9-NEXT: v_mul_lo_u16_sdwa v3, s5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v4, s6, v4 +; GFX9-NEXT: v_mul_lo_u16_sdwa v5, s7, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v6, s9, v6 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v4, v6, v7 ; GFX9-NEXT: v_mul_lo_u16_e32 v8, s11, v8 -; GFX9-NEXT: v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-NEXT: v_mul_lo_u16_sdwa v9, s1, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX9-NEXT: v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v5 -; GFX9-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 @@ -2713,66 +2717,66 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s5, s3, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 28 -; GFX9-DL-NEXT: s_and_b32 s16, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s12, s8, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s8, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s8, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s15, s8, 28 +; GFX9-DL-NEXT: s_and_b32 s16, s8, 15 +; GFX9-DL-NEXT: s_bfe_u32 s17, s8, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s18, s8, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s8, s8, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x40018 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-DL-NEXT: s_lshr_b32 s8, s3, 28 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: s_and_b32 s9, s3, 15 +; GFX9-DL-NEXT: s_and_b32 s9, s1, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s16 -; GFX9-DL-NEXT: s_bfe_u32 s10, s3, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s17 -; GFX9-DL-NEXT: s_bfe_u32 s11, s3, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s18 -; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s4 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, s5, v2 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, s7, v4 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, s4, v2 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v3, s5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, s6, v4 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, s7, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v6, s9, v6 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v4, v6, v7 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, s11, v8 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, s1, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v5 -; GFX9-DL-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v1, v4, v1 @@ -2784,7 +2788,7 @@ ; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: @@ -2801,45 +2805,45 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, s3, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s3 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s8, s7 +; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, s1, s3 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 +; GFX10-DL-NEXT: s_bfe_u32 s8, s6, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x4000c +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s0, s1 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s8, s3 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v2 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s2, s1, 0x40008 -; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s7, 0x40008 +; GFX10-DL-NEXT: s_mov_b32 s1, 0xffff +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s0 ; GFX10-DL-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: v_and_b32_e32 v2, s3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s6, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s8, s7, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: v_and_b32_e32 v2, s1, v2 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s6, s8 -; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x40010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s2, s8 +; GFX10-DL-NEXT: s_bfe_u32 s3, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s9, s7, 0x40010 +; GFX10-DL-NEXT: s_lshr_b32 s6, s6, 28 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3 -; GFX10-DL-NEXT: s_lshr_b32 s6, s1, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s9 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s6 +; GFX10-DL-NEXT: s_lshr_b32 s2, s7, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s0, s9 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s6, s2 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40018 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s7, s0 +; GFX10-DL-NEXT: s_bfe_u32 s0, s7, 0x40018 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s3, s0 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v6 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s3, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s1, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3002,41 +3006,41 @@ ; ; GFX9-LABEL: udot8_acc4_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s10, s2, 15 -; GFX9-NEXT: s_and_b32 s17, s3, 15 -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s10, s0, 15 +; GFX9-NEXT: s_and_b32 s17, s1, 15 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008 -; GFX9-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, s0, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-NEXT: v_mov_b32_e32 v7, s13 @@ -3053,46 +3057,46 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s10, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s17, s3, 15 -; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_and_b32 s10, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s17, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 -; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s0, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 @@ -3109,7 +3113,7 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: @@ -3126,36 +3130,36 @@ ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s0, s6, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s7, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s7, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s7, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s3, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x4000c +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s1, s3 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s2, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s6, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s7, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] @@ -3288,34 +3292,34 @@ ; GFX9-LABEL: udot8_variant1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 15 -; GFX9-NEXT: s_and_b32 s5, s3, 15 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_and_b32 s4, s0, 15 +; GFX9-NEXT: s_and_b32 s5, s1, 15 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mad_u32_u24 v1, s5, v1, v2 -; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40004 -; GFX9-NEXT: s_bfe_u32 s9, s3, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40018 -; GFX9-NEXT: s_lshr_b32 s3, s3, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x40004 +; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40018 +; GFX9-NEXT: s_lshr_b32 s1, s1, 28 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mad_u32_u24 v1, s1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mad_u32_u24 v1, s7, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 @@ -3328,40 +3332,40 @@ ; GFX9-NEXT: v_mad_u32_u24 v1, s15, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mad_u32_u24 v1, s17, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_variant1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s1, s0, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, i32 addrspace(1)* %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; FIXME: Merge into imm.ll diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1,7 +1,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,IDXMODE-NO-XNACK,PREGFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,IDXMODE-XNACK,GFX9 %s ; Tests for indirect addressing on SI, which is implemented using dynamic ; indexing of vectors. @@ -273,7 +273,8 @@ ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} ; MOVREL: v_movreld_b32_e32 v0, 5 -; IDXMODE: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0xfffffe00{{$}} +; IDXMODE-NO-XNACK: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0xfffffe00{{$}} +; IDXMODE-XNACK: s_addk_i32 s{{[0-9]+}}, 0xfe00{{$}} ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(DST) ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; IDXMODE-NEXT: s_set_gpr_idx_off diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -53,7 +53,7 @@ ; GCN-NEXT: is_ptr64 = 1 ; GCN-NEXT: is_dynamic_callstack = 1 ; GCN-NEXT: is_debug_enabled = 0 -; GCN-NEXT: is_xnack_enabled = 1 +; GCN-NEXT: is_xnack_enabled = 0 ; GCN-NEXT: workitem_private_segment_byte_size = 16384 ; GCN-NEXT: workgroup_group_segment_byte_size = 0 ; GCN-NEXT: gds_segment_byte_size = 0 @@ -149,7 +149,7 @@ ; GCN-NEXT: is_ptr64 = 1 ; GCN-NEXT: is_dynamic_callstack = 1 ; GCN-NEXT: is_debug_enabled = 0 -; GCN-NEXT: is_xnack_enabled = 1 +; GCN-NEXT: is_xnack_enabled = 0 ; GCN-NEXT: workitem_private_segment_byte_size = 16384 ; GCN-NEXT: workgroup_group_segment_byte_size = 0 ; GCN-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -40,12 +40,12 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s2, s4, s2 +; GFX9-NEXT: s_pack_lh_b32_b16 s2, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -91,13 +91,13 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: ;;#ASMSTART @@ -157,12 +157,12 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s2, s4, s2 +; GFX9-NEXT: s_pack_hh_b32_b16 s2, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -210,11 +210,11 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -275,11 +275,11 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2 @@ -387,12 +387,12 @@ ; GFX9-LABEL: s_insertelement_v2i16_1_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -562,12 +562,12 @@ ; GFX9-LABEL: v_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s4 +; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1039,17 +1039,17 @@ ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, s4, 4 -; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_andn2_b32 s2, s2, s3 -; GFX9-NEXT: s_and_b32 s3, s3, 0x3e703e7 -; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_lshl_b32 s2, s4, 4 +; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s3, s5, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7 +; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -1102,12 +1102,12 @@ ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_lshl_b32 s2, s4, 4 +; GFX9-NEXT: s_lshl_b32 s2, s6, 4 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 @@ -1169,10 +1169,10 @@ ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1252,13 +1252,13 @@ ; GFX9-LABEL: v_insertelement_v4f16_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, v3, s4, v0 +; GFX9-NEXT: v_bfi_b32 v0, v3, s6, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1317,13 +1317,13 @@ ; GFX9-LABEL: v_insertelement_v4f16_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s4, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, s6, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1382,13 +1382,13 @@ ; GFX9-LABEL: v_insertelement_v4f16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v3, s4, v1 +; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1447,13 +1447,13 @@ ; GFX9-LABEL: v_insertelement_v4f16_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1512,13 +1512,13 @@ ; GFX9-LABEL: v_insertelement_v4i16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v3, s4, v1 +; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1579,7 +1579,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v2, v[0:1], off ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] @@ -1588,7 +1588,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1 ; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0 @@ -1667,17 +1667,17 @@ ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6 ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 4 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_lshl_b32 s4, s7, 4 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1 ; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { ; SI-LABEL: s_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s4, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: @@ -31,11 +31,11 @@ ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) @@ -46,13 +46,13 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 { ; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s2, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s4, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: @@ -69,10 +69,10 @@ ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s0, s0 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) @@ -92,21 +92,21 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32: @@ -134,11 +134,11 @@ ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -266,21 +266,21 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: @@ -308,11 +308,11 @@ ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -333,21 +333,21 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: @@ -375,11 +375,11 @@ ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -400,21 +400,21 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: @@ -442,11 +442,11 @@ ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -468,21 +468,21 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: @@ -510,11 +510,11 @@ ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll @@ -32,8 +32,9 @@ } ; GCN-LABEL: {{^}}sample_contig_nsa: -; GCN: image_sample_c_l v0, v[0:7], -; NSA: image_sample v1, [v6, v7, v5], +; NONSA: image_sample_c_l v5, v[0:7], +; NSA: image_sample_c_l v8, v[0:7], +; NSA: image_sample v9, [v6, v7, v5], define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -44,8 +45,8 @@ } ; GCN-LABEL: {{^}}sample_nsa_nsa: -; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0], -; NSA: image_sample v1, [v6, v7, v5], +; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0], +; NSA: image_sample v9, [v6, v7, v5], define amdgpu_ps <2 x float> @sample_nsa_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %r2, float %s2, float %t2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -56,8 +57,8 @@ } ; GCN-LABEL: {{^}}sample_nsa_contig: -; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0], -; NSA: image_sample v1, v[5:7], +; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0], +; NSA: image_sample v9, v[5:7], define amdgpu_ps <2 x float> @sample_nsa_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %s2, float %t2, float %r2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -68,9 +69,10 @@ } ; GCN-LABEL: {{^}}sample_contig_contig: -; GCN: image_sample_c_l v0, v[0:7], -; NSA: image_sample v1, v[5:7], -; NONSA: image_sample v1, v[5:7], +; NSA: image_sample_c_l v8, v[0:7], +; NSA: image_sample v9, v[5:7], +; NONSA: image_sample_c_l v8, v[0:7], +; NONSA: image_sample v9, v[5:7], define amdgpu_ps <2 x float> @sample_contig_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %s2, float %t2, float %r2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -8,7 +8,7 @@ ; VARIANT0-LABEL: test_barrier: ; VARIANT0: ; %bb.0: ; %entry ; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb +; VARIANT0-NEXT: s_load_dword s0, s[0:1], 0xb ; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT0-NEXT: s_mov_b32 s6, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -18,7 +18,7 @@ ; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VARIANT0-NEXT: s_barrier -; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s2, v3 +; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 @@ -29,7 +29,7 @@ ; VARIANT1-LABEL: test_barrier: ; VARIANT1: ; %bb.0: ; %entry ; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb +; VARIANT1-NEXT: s_load_dword s0, s[0:1], 0xb ; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT1-NEXT: s_mov_b32 s6, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -38,7 +38,7 @@ ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_barrier -; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s2, v3 +; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT1-NEXT: s_waitcnt expcnt(0) @@ -50,11 +50,11 @@ ; VARIANT2-LABEL: test_barrier: ; VARIANT2: ; %bb.0: ; %entry ; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c +; VARIANT2-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] -; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s0 +; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VARIANT2-NEXT: v_mov_b32_e32 v3, s3 @@ -70,11 +70,11 @@ ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry ; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c +; VARIANT3-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] -; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s0 +; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VARIANT3-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -5,14 +5,14 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_bfe_u32 v0, v0, s3, s3 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_bfe_u32 v0, v0, s5, s5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_arg_arg_arg: @@ -34,15 +34,15 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_arg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfe_u32 v0, s2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfe_u32 v0, s4, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_arg_arg_imm: @@ -65,15 +65,15 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_imm_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfe_u32 v0, s2, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfe_u32 v0, s4, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_arg_imm_arg: @@ -96,16 +96,16 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_imm_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s0, 0x7b +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_movk_i32 s6, 0x7b ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfe_u32 v0, s0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfe_u32 v0, s6, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_imm_arg_arg: @@ -1590,13 +1590,13 @@ ; SI-LABEL: lshr_and: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s2, 0x30006 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_and: @@ -1619,15 +1619,15 @@ define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; SI-LABEL: v_lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s0, s2, s3 -; SI-NEXT: s_and_b32 s0, s0, 7 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_lshr_b32 s2, s4, s5 +; SI-NEXT: s_and_b32 s4, s2, 7 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_and: @@ -1652,13 +1652,13 @@ ; SI-LABEL: and_lshr: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s2, 0x30006 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: and_lshr: @@ -1682,13 +1682,13 @@ ; SI-LABEL: and_lshr2: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s2, 0x30006 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: and_lshr2: @@ -1712,13 +1712,13 @@ ; SI-LABEL: shl_lshr: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s2, 0x150002 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bfe_u32 s4, s2, 0x150002 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_lshr: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -292,17 +292,17 @@ ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s6, s6 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -529,12 +529,12 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 @@ -643,12 +643,12 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -315,17 +315,17 @@ ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s6, s6 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -582,12 +582,12 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 @@ -696,12 +696,12 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -3636,98 +3636,98 @@ ; GCN-NOHSA-SI: ; %bb.0: ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x10 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s37, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s36, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s22, s37 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s23, s36 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s39, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s38, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s26, s39 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s27, s38 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s41, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s40, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s30, s41 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s31, s40 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s43, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s42, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s35, s43 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s36, s42 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s45, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s44, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s39, s45 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s40, s44 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s47, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s42, s46, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s43, s47 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s44, s46 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s49, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s48, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s47, s49 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s48, s48 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s49, s51, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s52, s50, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s51, s51 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s50, s50 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s53, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s54, s4, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s4, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s6, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s22, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s6, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s8, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s8, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s10, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s60, s11 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s10, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s12, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s12, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s15, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s14, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s15, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s14, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s17, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s16, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s17, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s16, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s17, s17 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s16, s16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s19, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s18, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s19, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s52, s18, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s19, s19 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s18, s18 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s11, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s53, s37, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s54, s36, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s37, s37 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s36, s36 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s39, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s38, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s39, s39 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s38, s38 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s41, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s40, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s41, s41 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s40, s40 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s42, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s60, s43 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s42, s42 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s45, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s44, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s45, s45 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s44, s44 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s47, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s46, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s47, s47 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s46, s46 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s49, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s48, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s49, s49 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s48, s48 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s51, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s50, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s51, s51 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s50, s50 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s43, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s49 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s46 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s47 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s45 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s42 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s59 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s43 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s41 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s57 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 @@ -3736,63 +3736,63 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s55 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s53 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -510,14 +510,14 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_load_v16i16: @@ -8003,33 +8003,33 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v3 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[6:7], 48 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[2:3], 48 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[4:5], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[0:1], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v7 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[2:3], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[6:7], 48 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[0:1], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[4:5], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v5, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) @@ -8055,48 +8055,48 @@ ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v9, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v7, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v8, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v9, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 @@ -8105,7 +8105,7 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -41,13 +41,13 @@ ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 ; MUBUF-NEXT: v_add_u32_e32 v1, 0x20d0, v1 ; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen -; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen -; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: s_waitcnt vmcnt(1) -; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 +; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v5, vcc ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) ; MUBUF-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -123,15 +123,15 @@ ; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3 ; MUBUF-NEXT: v_add_u32_e32 v3, 0x20d0, v3 ; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen -; MUBUF-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen -; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x180000 ; MUBUF-NEXT: s_mov_b32 s33, s5 ; MUBUF-NEXT: s_waitcnt vmcnt(1) -; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5 +; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc ; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -10,11 +10,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s5, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -140,12 +140,12 @@ ; GFX9-LABEL: lshr_v_s_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -209,12 +209,12 @@ ; GFX9-LABEL: lshr_s_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s0 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -29,11 +29,11 @@ ; GFX9-LABEL: v_test_imax_sge_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -79,11 +79,11 @@ ; GFX9-LABEL: v_test_imax_sge_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -140,14 +140,17 @@ ; GFX9-LABEL: v_test_imax_sge_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_short_d16 v1, v0, s[0:1] offset:4 -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] +; GFX9-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v4, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_pk_max_i16 v1, v2, v1 @@ -200,11 +203,11 @@ ; GFX9-LABEL: v_test_imax_sge_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 @@ -249,11 +252,11 @@ ; GFX9-LABEL: v_test_imax_sgt_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -297,11 +300,11 @@ ; GFX9-LABEL: v_test_umax_uge_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -345,11 +348,11 @@ ; GFX9-LABEL: v_test_umax_ugt_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -394,11 +397,11 @@ ; GFX9-LABEL: v_test_umax_ugt_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -495,8 +495,8 @@ ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: ; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} ; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} -; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; GCN: s_min_u32 [[MIN:s[0-9]+]], s{{[0-9]}}, s{{[0-9]}} +; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], s{{[0-9]}} ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_UINT diff --git a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll --- a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll +++ b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}sample_contig_nsa: ; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], diff --git a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir --- a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: hazard_image_sample_d_buf_off6 # GCN: IMAGE_SAMPLE diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll --- a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll @@ -47,10 +47,10 @@ ; GCN-LABEL: reassoc_v2i32: ; GCN: s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_add_i32 [[ADD2:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} -; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}} +; GCN-DAG: s_add_i32 [[ADD2:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} +; GFX8-DAG: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}} ; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD2]], v{{[0-9]+}} -; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}} +; GFX9-DAG: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}} ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD2]], v{{[0-9]+}} define amdgpu_kernel void @reassoc_v2i32(<2 x i32> addrspace(1)* %arg, <2 x i32> %x, <2 x i32> %y) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -58,19 +58,19 @@ ; GFX9-LABEL: saddo_i64_zext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_add_u32 s2, s6, s0 +; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s3, s7, s1 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm @@ -132,14 +132,14 @@ ; GFX9-LABEL: s_saddo_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_add_i32 s1, s0, s1 -; GFX9-NEXT: v_add_i32 v1, s0, v1 clamp -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_add_i32 s0, s2, s3 +; GFX9-NEXT: v_add_i32 v1, s2, v1 clamp +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -223,13 +223,13 @@ define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind { ; SI-LABEL: scalar_to_vector_test6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_to_vector_test6: diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll --- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll @@ -8,15 +8,15 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s8, s[2:3], 0x0 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, 1, s8 +; GCN-NEXT: s_lshr_b32 s0, 1, s2 ; GCN-NEXT: s_ff1_i32_b32 s0, s0 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], s8, 0 +; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[2:3] ; GCN-NEXT: v_ffbh_i32_e32 v1, v0 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll --- a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013 define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) { @@ -14,6 +15,7 @@ } ; GCN-LABEL: const_load_no_shrink_dword_to_aligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003 define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) { @@ -27,6 +29,7 @@ } ; GCN-LABEL: global_load_no_shrink_dword_to_unaligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013 define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %x) { @@ -40,6 +43,7 @@ } ; GCN-LABEL: global_load_no_shrink_dword_to_aligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003 define amdgpu_kernel void @global_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) { diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,12 +8,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -140,12 +140,12 @@ ; GFX9-LABEL: shl_v_s_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, s2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -209,12 +209,12 @@ ; GFX9-LABEL: shl_s_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -150,8 +150,8 @@ ; GCN-LABEL: {{^}}smrd_load_const1: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff glc ; encoding: [0xff -; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc ; -; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc glc ; +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc ; +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc glc ; define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -173,8 +173,8 @@ ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400 +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400 define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -194,8 +194,8 @@ ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -211,8 +211,8 @@ ; SMRD load with an offset greater than the largest possible immediate on VI ; GCN-LABEL: {{^}}smrd_load_const4: ; SIVIGFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 -; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] -; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]:[0-9]+}}], [[OFFSET]] +; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]:[0-9]+}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -54,7 +54,7 @@ ; GFX6: ScratchSize: 8608 ; FLATSCR: s_movk_i32 [[SOFF1:s[0-9]+]], 0x -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(4) ; FLATSCR-NEXT: scratch_store_dword off, v{{[0-9]+}}, [[SOFF1]] ; 4-byte Folded Spill ; FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF2]] ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -137,7 +137,7 @@ ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; GCN: NumVgprs: 10 -; GFX900: ScratchSize: 44 +; GFX900: ScratchSize: 52 ; GFX908: ScratchSize: 20 ; GCN: VGPRBlocks: 2 ; GCN: NumVGPRsForWavesPerEU: 10 @@ -246,7 +246,7 @@ ; GFX908-DAG: v_accvgpr_read_b32 ; GCN: NumVgprs: 256 -; GFX900: ScratchSize: 2052 +; GFX900: ScratchSize: 1028 ; GFX908-FIXME: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir @@ -308,12 +308,12 @@ ; VR: renamable $sgpr8 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr12, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr9 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr13, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr14 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr15, 0, 0 :: (dereferenceable invariant load 4) - ; VR: renamable $sgpr15 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr16, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr10_sgpr11 = IMPLICIT_DEF + ; VR: renamable $sgpr17 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr22, 0, 0 :: (dereferenceable invariant load 4) + ; VR: renamable $sgpr15 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr16, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr12 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr18, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr13 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr19, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr16 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr21, 0, 0 :: (dereferenceable invariant load 4) - ; VR: renamable $sgpr17 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr22, 0, 0 :: (dereferenceable invariant load 4) ; VR: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr10_sgpr11, implicit killed renamable $sgpr8, implicit killed renamable $sgpr9, implicit killed renamable $sgpr12, implicit killed renamable $sgpr13, implicit killed renamable $sgpr14, implicit killed renamable $sgpr15, implicit killed renamable $sgpr16, implicit killed renamable $sgpr17 %0:sgpr_128 = IMPLICIT_DEF %1:sreg_64 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -10,56 +10,61 @@ ; CHECK: bb.0..expVert: ; CHECK: liveins: $sgpr3, $sgpr4, $sgpr5, $sgpr8, $sgpr9, $sgpr10, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr25, $sgpr27, $sgpr31 ; CHECK: undef %56.sub0:sgpr_64 = COPY $sgpr31 - ; CHECK: SI_SPILL_S32_SAVE $sgpr27, %stack.2, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) - ; CHECK: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr25 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; CHECK: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr18 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr27 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr25 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr18 ; CHECK: undef %50.sub0:sgpr_64 = COPY $sgpr19 - ; CHECK: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr20 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr21 - ; CHECK: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr22 - ; CHECK: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr23 - ; CHECK: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr9 - ; CHECK: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr10 - ; CHECK: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr8 + ; CHECK: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr20 + ; CHECK: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr21 + ; CHECK: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr22 + ; CHECK: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr23 + ; CHECK: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr9 + ; CHECK: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr10 + ; CHECK: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK: undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0, 0 :: (load 8 from %ir.40, addrspace 4) ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; CHECK: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc - ; CHECK: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc - ; CHECK: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc ; CHECK: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc ; CHECK: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc - ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc - ; CHECK: SI_SPILL_S32_SAVE [[S_AND_B32_]], %stack.0, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.0, addrspace 5) + ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc - ; CHECK: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY4]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0, 0 :: (load 16 from %ir.84, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0, 0 :: (load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %132:sgpr_128, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: early-clobber %73:sgpr_128, early-clobber %143:sgpr_128, early-clobber %131:sreg_32_xm0_xexec = BUNDLE %130, undef %132:sgpr_128, undef %74:sreg_64 { + ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0, 0 :: (load 16 from %ir.84, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0, 0 :: (load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %132:sgpr_128, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: } ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK: %71.sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK: %71.sub2:sgpr_128 = S_MOV_B32 -1 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: SI_SPILL_S128_SAVE %71, %stack.1, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5) + ; CHECK: early-clobber %87:vgpr_32, early-clobber %117:vgpr_32, early-clobber %76:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM1]], undef %118:sgpr_128, undef %89:sgpr_128, [[V_MOV_B32_e32_]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } + ; CHECK: [[COPY13:%[0-9]+]]:sgpr_128 = COPY %71 ; CHECK: %71.sub1:sgpr_128 = S_MOV_B32 0 ; CHECK: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc ; CHECK: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc - ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY4]], 64, implicit-def $scc + ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY5]], 64, implicit-def $scc ; CHECK: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %54:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: undef %149.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %149.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK: undef %156.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK: %156.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc ; CHECK: undef %163.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0, 0 :: (load 16 from %ir.91, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0, 0 :: (load 16 from %ir.97, addrspace 4) + ; CHECK: early-clobber %150:sgpr_128, early-clobber %157:sgpr_128 = BUNDLE %149, %156 { + ; CHECK: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0, 0 :: (load 16 from %ir.91, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0, 0 :: (load 16 from %ir.97, addrspace 4) + ; CHECK: } ; CHECK: %163.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %171:sreg_32, 31, implicit-def dead $scc ; CHECK: undef %176.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], undef %171:sreg_32, implicit-def $scc @@ -87,55 +92,67 @@ ; CHECK: %253.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: undef %261.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], undef %171:sreg_32, implicit-def $scc ; CHECK: %261.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %273.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK: undef %273.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %273.sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %286.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK: undef %286.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK: %286.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %293.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK: undef %293.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %293.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, undef %314:sreg_32, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 16, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %369:sgpr_128, undef %370:sreg_32, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0, 0 :: (load 16 from %ir.111, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0, 0 :: (load 16 from %ir.117, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0, 0 :: (load 16 from %ir.123, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0, 0 :: (load 16 from %ir.131, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0, 0 :: (load 16 from %ir.138, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: early-clobber %379:sreg_32_xm0_xexec, early-clobber %201:sgpr_128, early-clobber %177:sgpr_128, early-clobber %184:sgpr_128, early-clobber %319:sreg_32_xm0_xexec, early-clobber %191:sgpr_128, early-clobber %309:sreg_32_xm0_xexec, early-clobber %323:sreg_32_xm0_xexec, early-clobber %368:sreg_32_xm0_xexec, early-clobber %313:sreg_32_xm0_xexec, early-clobber %211:sgpr_128 = BUNDLE [[S_ADD_I32_]], %71, undef %369:sgpr_128, %210, undef %314:sreg_32, %200, undef %380:sgpr_128, %176, %183, [[S_ADD_I32_1]], %190, undef %370:sreg_32 { + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, undef %314:sreg_32, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 16, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %369:sgpr_128, undef %370:sreg_32, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0, 0 :: (load 16 from %ir.111, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0, 0 :: (load 16 from %ir.117, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0, 0 :: (load 16 from %ir.123, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0, 0 :: (load 16 from %ir.131, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0, 0 :: (load 16 from %ir.138, addrspace 4) + ; CHECK: } + ; CHECK: early-clobber %151:vgpr_32, early-clobber %158:vgpr_32, early-clobber %165:vgpr_32 = BUNDLE [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], [[S_LOAD_DWORDX4_IMM3]], [[S_LOAD_DWORDX4_IMM4]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } + ; CHECK: early-clobber %374:sreg_32_xm0_xexec, early-clobber %363:sreg_32_xm0_xexec = BUNDLE [[S_ADD_I32_]], undef %364:sgpr_128, undef %375:sgpr_128, [[S_ADD_I32_1]] { + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: } ; CHECK: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR]], -98, implicit-def dead $scc ; CHECK: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR1]], -114, implicit-def dead $scc ; CHECK: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR2]], -130, implicit-def dead $scc ; CHECK: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc - ; CHECK: undef %327.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK: undef %327.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %327.sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK: undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %335.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK: undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK: %343.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK: undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %351.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY9]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc ; CHECK: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %396:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0, 0 :: (load 16 from %ir.155, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0, 0 :: (load 16 from %ir.144, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0, 0 :: (load 16 from %ir.150, addrspace 4) + ; CHECK: early-clobber %218:sgpr_128, early-clobber %225:sgpr_128, early-clobber %231:sgpr_128 = BUNDLE %217, %224, %50 { + ; CHECK: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0, 0 :: (load 16 from %ir.155, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0, 0 :: (load 16 from %ir.144, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0, 0 :: (load 16 from %ir.150, addrspace 4) + ; CHECK: } ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0, 0 :: (load 16 from %ir.162, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0, 0 :: (load 16 from %ir.170, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: early-clobber %254:sgpr_128, early-clobber %242:sgpr_128 = BUNDLE %253, %241 { + ; CHECK: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0, 0 :: (load 16 from %ir.162, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0, 0 :: (load 16 from %ir.170, addrspace 4) + ; CHECK: } + ; CHECK: early-clobber %212:vgpr_32, early-clobber %202:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM8]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } ; CHECK: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -217, implicit-def dead $scc ; CHECK: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -233, implicit-def dead $scc ; CHECK: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR5]], -249, implicit-def dead $scc @@ -144,35 +161,41 @@ ; CHECK: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -329, implicit-def dead $scc ; CHECK: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -345, implicit-def dead $scc ; CHECK: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR6]], -441, implicit-def dead $scc - ; CHECK: [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 160, implicit-def $scc + ; CHECK: [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], 160, implicit-def $scc ; CHECK: [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %36:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: undef %411.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %411.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 4, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc ; CHECK: undef %425.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_4]], implicit-def $scc ; CHECK: %425.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ADD_U32_4:%[0-9]+]]:sreg_32 = S_ADD_U32 %56.sub0, 168, implicit-def $scc ; CHECK: [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %57:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc ; CHECK: undef %441.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_5]], implicit-def $scc ; CHECK: %441.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %441, 0, 0, 0 :: (load 4 from %ir..i085.i, align 8, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0, 0 :: (load 16 from %ir.176, addrspace 4) + ; CHECK: early-clobber %71.sub0:sgpr_128, early-clobber %262:sgpr_128 = BUNDLE %261, %441 { + ; CHECK: internal %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %441, 0, 0, 0 :: (load 4 from %ir..i085.i, align 8, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0, 0 :: (load 16 from %ir.176, addrspace 4) + ; CHECK: } ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0, 0 :: (load 16 from %ir.185, addrspace 4) ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0, 0 :: (load 16 from %ir.194, addrspace 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: early-clobber %445:sreg_32_xm0_xexec, early-clobber %287:sgpr_128 = BUNDLE %71, %286 { + ; CHECK: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0, 0 :: (load 16 from %ir.194, addrspace 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: } ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0, 0 :: (load 16 from %ir.200, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc + ; CHECK: early-clobber %281:vgpr_32, early-clobber %275:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM1]], [[S_LOAD_DWORDX4_IMM16]], [[V_MOV_B32_e32_]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } + ; CHECK: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc @@ -180,16 +203,20 @@ ; CHECK: %453.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %453, 0, 0, 0 :: (load 8 from %ir.304, addrspace 4) ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0, 0 :: (load 16 from %ir.223, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0, 0 :: (load 16 from %ir.230, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0, 0 :: (load 16 from %ir.236, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0, 0 :: (load 16 from %ir.242, addrspace 4) + ; CHECK: early-clobber %336:sgpr_128, early-clobber %352:sgpr_128, early-clobber %328:sgpr_128, early-clobber %344:sgpr_128 = BUNDLE %327, %343, %335, %351 { + ; CHECK: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0, 0 :: (load 16 from %ir.223, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0, 0 :: (load 16 from %ir.230, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0, 0 :: (load 16 from %ir.236, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0, 0 :: (load 16 from %ir.242, addrspace 4) + ; CHECK: } ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 3, implicit-def dead $scc + ; CHECK: early-clobber %329:vgpr_32, early-clobber %345:vgpr_32, early-clobber %337:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM20]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], [[S_LOAD_DWORDX4_IMM21]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } + ; CHECK: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc @@ -197,18 +224,22 @@ ; CHECK: %468.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %468, 0, 0, 0 :: (load 8 from %ir.316, addrspace 4) ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0, 0 :: (load 16 from %ir.278, addrspace 4) - ; CHECK: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0, 0 :: (load 4 from `i32 addrspace(4)* undef`, addrspace 4) - ; CHECK: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 3, implicit-def dead $scc + ; CHECK: early-clobber %412:sgpr_128, early-clobber %487:sreg_32_xm0_xexec, early-clobber %475:sreg_32_xm0_xexec = BUNDLE %71, undef %488:sreg_64, %411 { + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0, 0 :: (load 16 from %ir.278, addrspace 4) + ; CHECK: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0, 0 :: (load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; CHECK: } + ; CHECK: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 3, implicit-def dead $scc ; CHECK: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %425, 0, 0, 0 :: (load 16 from %ir.287, addrspace 4) ; CHECK: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc ; CHECK: undef %485.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_8]], implicit-def $scc ; CHECK: %485.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %485, 0, 0, 0 :: (load 4 from %ir..i0100.i, align 8, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: early-clobber %413:vgpr_32, early-clobber %427:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM23]], [[S_LOAD_DWORDX4_IMM24]], [[V_MOV_B32_e32_]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc @@ -217,8 +248,7 @@ ; CHECK: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -507, implicit-def dead $scc ; CHECK: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -539, implicit-def dead $scc ; CHECK: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc - ; CHECK: [[SI_SPILL_S32_RESTORE:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) - ; CHECK: [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[SI_SPILL_S32_RESTORE]], 96, implicit-def $scc + ; CHECK: [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 96, implicit-def $scc ; CHECK: [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %33:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: undef %514.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %514.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc @@ -229,9 +259,11 @@ ; CHECK: undef %530.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %530.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %530, 0, 0, 0 :: (load 16 from %ir.359, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: early-clobber %516:vgpr_32, early-clobber %532:vgpr_32, early-clobber %524:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM26]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], [[S_LOAD_DWORDX4_IMM27]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], implicit $exec ; CHECK: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec @@ -336,13 +368,8 @@ ; CHECK: [[V_OR_B32_e32_61:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_60]], [[V_ADD_U32_e32_25]], implicit $exec ; CHECK: [[V_ADD_U32_e32_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_OR_B32_e32_62:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_61]], [[V_ADD_U32_e32_26]], implicit $exec - ; CHECK: [[SI_SPILL_S32_RESTORE1:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.0, addrspace 5) - ; CHECK: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load 16 from %stack.1, align 4, addrspace 5) - ; CHECK: undef %914.sub2_sub3:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub2_sub3 { - ; CHECK: internal %914.sub0:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub0 - ; CHECK: } - ; CHECK: %914.sub1:sgpr_128 = COPY [[SI_SPILL_S32_RESTORE1]] - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %914, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[COPY13]].sub1:sgpr_128 = COPY [[S_AND_B32_]] + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY13]], 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[V_ADD_U32_e32_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -576, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_OR_B32_e32_63:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_62]], [[V_ADD_U32_e32_27]], implicit $exec ; CHECK: [[V_ADD_U32_e32_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -577, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll b/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll --- a/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll +++ b/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll @@ -1,10 +1,8 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx902 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx904 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NO-ECC %s +; RUN: llc -march=amdgcn -mcpu=gfx904 -mattr=+sramecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+sramecc < %s | FileCheck -check-prefixes=GCN,ECC %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-sramecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s ; Make sure the correct set of targets are marked with ; FeatureDoesNotSupportSRAMECC, and +sram-ecc is ignored if it's never diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s + +; REQUIRES: asserts + +; NOT-SUPPORTED: sramecc setting for subtarget: Unsupported +; ANY: sramecc setting for subtarget: Any +define void @sramecc-subtarget-feature-default() #0 { + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s + +; REQUIRES: asserts + +; WARN: warning: sramecc 'Off' was requested for a processor that does not support it! +; OFF: sramecc setting for subtarget: Off + +define void @sramecc-subtarget-feature-disabled() #0 { + ret void +} + +attributes #0 = { "target-features"="-sramecc" } diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s + +; REQUIRES: asserts + +; WARN: warning: sramecc 'On' was requested for a processor that does not support it! +; ON: sramecc setting for subtarget: On +define void @sramecc-subtarget-feature-enabled() #0 { + ret void +} + +attributes #0 = { "target-features"="+sramecc" } diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; @@ -51,43 +51,43 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:12 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 ; GFX9-NEXT: ds_write_b8 v0, v2 offset:8 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s4, s3, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_lshr_b32 s0, s7, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s7, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:15 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s6, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s2, s1, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 @@ -205,20 +205,20 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: ds_write_b16 v0, v1 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm @@ -287,15 +287,15 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3 ; GFX9-NEXT: s_endpgm ; @@ -335,14 +335,14 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; @@ -380,14 +380,14 @@ define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -6,13 +6,13 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; @@ -49,34 +49,34 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 ; GFX9-NEXT: ds_write_b8 v0, v2 offset:4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s6, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s2, s1, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 @@ -172,17 +172,17 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: ds_write_b16 v0, v1 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm @@ -241,13 +241,13 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; GFX9-NEXT: ds_write_b32 v0, v3 offset:8 ; GFX9-NEXT: s_endpgm @@ -286,13 +286,13 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm @@ -331,13 +331,13 @@ define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -14,10 +14,10 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -123,11 +123,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -380,15 +380,15 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -432,19 +432,19 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v0 +; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -488,15 +488,15 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -542,15 +542,15 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v0 +; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -18,11 +18,12 @@ ; GFX9-LABEL: shuffle_v4f16_234u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -74,11 +75,12 @@ ; GFX9-LABEL: shuffle_v4f16_3u6u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -90,11 +92,12 @@ ; GFX9-LABEL: shuffle_v4f16_3uu7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -106,15 +109,15 @@ ; GFX9-LABEL: shuffle_v4f16_35u5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -126,14 +129,14 @@ ; GFX9-LABEL: shuffle_v4f16_357u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -173,9 +176,12 @@ ; GFX9-LABEL: shuffle_v4f16_0145: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -187,9 +193,12 @@ ; GFX9-LABEL: shuffle_v4f16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -229,9 +238,12 @@ ; GFX9-LABEL: shuffle_v4f16_2345: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -243,9 +255,12 @@ ; GFX9-LABEL: shuffle_v4f16_2367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -257,11 +272,12 @@ ; GFX9-LABEL: shuffle_v4f16_4501: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -273,11 +289,12 @@ ; GFX9-LABEL: shuffle_v4f16_4523: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -316,11 +333,12 @@ ; GFX9-LABEL: shuffle_v4f16_6701: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -332,11 +350,12 @@ ; GFX9-LABEL: shuffle_v4f16_6723: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -376,13 +395,14 @@ ; GFX9-LABEL: shuffle_v4f16_2356: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -394,13 +414,14 @@ ; GFX9-LABEL: shuffle_v4f16_5623: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -412,14 +433,15 @@ ; GFX9-LABEL: shuffle_v4f16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -431,15 +453,14 @@ ; GFX9-LABEL: shuffle_v4f16_5634: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -451,16 +472,16 @@ ; GFX9-LABEL: shuffle_v4f16_5734: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -472,13 +493,14 @@ ; GFX9-LABEL: shuffle_v4i16_2356: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -490,9 +512,12 @@ ; GFX9-LABEL: shuffle_v4i16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -556,12 +581,12 @@ ; GFX9-LABEL: shuffle_v4f16_6161: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -634,9 +659,12 @@ ; GFX9-LABEL: shuffle_v8f16_4589: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:8 -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -648,11 +676,12 @@ ; GFX9-LABEL: shuffle_v8f16_10_11_2_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -664,13 +693,14 @@ ; GFX9-LABEL: shuffle_v8f16_13_14_2_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -713,14 +743,16 @@ ; GFX9-LABEL: shuffle_v6f16_452367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: global_load_dword v3, v[3:4], off +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off +; GFX9-NEXT: global_load_dword v7, v[3:4], off ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0 %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1 @@ -732,18 +764,18 @@ ; GFX9-LABEL: fma_shuffle: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm entry: %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -778,15 +810,15 @@ ; GFX9-LABEL: shuffle_v4f16_0456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v7, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -10,21 +10,21 @@ ; GCN-NEXT: BB0_1: ; %bb0 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 -; GCN-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 +; GCN-NEXT: v_add_co_u32_e64 v6, vcc_lo, v0, 8 ; GCN-NEXT: s_mov_b32 s5, exec_lo -; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] +; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s8, v4 -; GCN-NEXT: v_readfirstlane_b32 s9, v5 -; GCN-NEXT: v_readfirstlane_b32 s10, v2 -; GCN-NEXT: v_readfirstlane_b32 s11, v3 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5] -; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GCN-NEXT: v_readfirstlane_b32 s8, v2 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3] +; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 ; GCN-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx902 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s + +; REQUIRES: asserts + +; NOT-SUPPORTED: xnack setting for subtarget: Unsupported +; ANY: xnack setting for subtarget: Any +define void @xnack-subtarget-feature-any() #0 { + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s + +; REQUIRES: asserts + +; WARN: warning: xnack 'Off' was requested for a processor that does not support it! +; OFF: xnack setting for subtarget: Off + +define void @xnack-subtarget-feature-disabled() #0 { + ret void +} + +attributes #0 = { "target-features"="-xnack" } diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s + +; REQUIRES: asserts + +; WARN: warning: xnack 'On' was requested for a processor that does not support it! +; ON: xnack setting for subtarget: On +define void @xnack-subtarget-feature-enabled() #0 { + ret void +} + +attributes #0 = { "target-features"="+xnack" } diff --git a/llvm/test/MC/AMDGPU/xnack-mask.s b/llvm/test/MC/AMDGPU/xnack-mask.s --- a/llvm/test/MC/AMDGPU/xnack-mask.s +++ b/llvm/test/MC/AMDGPU/xnack-mask.s @@ -1,10 +1,10 @@ // RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s // RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s // RUN: not llvm-mc -arch=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1001 %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1001 -mattr=-xnack %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney %s 2>&1 | FileCheck -check-prefix=XNACKERR --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -show-encoding %s | FileCheck -check-prefix=XNACK %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -mattr=+xnack %s 2>&1 | FileCheck -check-prefix=XNACKERR --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -mattr=+xnack -show-encoding %s | FileCheck -check-prefix=XNACK %s s_mov_b64 xnack_mask, -1 // NOSICIVI10: error: register not available on this GPU