diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -137,10 +137,10 @@ "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; -def FeatureDoesNotSupportXNACK : SubtargetFeature<"no-xnack-support", - "DoesNotSupportXNACK", +def FeatureSupportsXNACK : SubtargetFeature<"xnack-support", + "SupportsXNACK", "true", - "Hardware does not support XNACK" + "Hardware supports XNACK" >; // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support @@ -499,16 +499,16 @@ [FeatureFlatGlobalInsts] >; -def FeatureDoesNotSupportSRAMECC : SubtargetFeature<"no-sram-ecc-support", - "DoesNotSupportSRAMECC", +def FeatureSupportsSRAMECC : SubtargetFeature<"sramecc-support", + "SupportsSRAMECC", "true", - "Hardware does not support SRAM ECC" + "Hardware supports SRAMECC" >; -def FeatureSRAMECC : SubtargetFeature<"sram-ecc", +def FeatureSRAMECC : SubtargetFeature<"sramecc", "EnableSRAMECC", "true", - "Enable SRAM ECC" + "Enable SRAMECC" >; def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx", @@ -674,8 +674,7 @@ [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, - FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC, - FeatureDoesNotSupportXNACK] + FeatureTrigReducedRange] >; def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", @@ -684,7 +683,7 @@ FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, - FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC] + FeatureDsSrc2Insts] >; def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", @@ -697,8 +696,7 @@ FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, - FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC, FeatureFastDenormalF32 - ] + FeatureDsSrc2Insts, FeatureFastDenormalF32] >; def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", @@ -714,8 +712,7 @@ FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, - FeatureFastDenormalF32, FeatureUnalignedDSAccess - ] + FeatureFastDenormalF32, FeatureUnalignedDSAccess, FeatureSupportsXNACK] >; def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", @@ -730,11 +727,9 @@ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, - FeatureVOP3Literal, FeatureDPP8, - FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC, - FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedDSAccess - ] + FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard, + FeaturePkFmacF16Inst, FeatureGFX10A16, FeatureFastDenormalF32, + FeatureG16, FeatureUnalignedDSAccess] >; class FeatureSet Features_> { @@ -744,84 +739,72 @@ def FeatureISAVersion6_0_0 : FeatureSet<[FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion6_0_1 : FeatureSet< [FeatureSouthernIslands, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion6_0_2 : FeatureSet< [FeatureSouthernIslands, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion7_0_0 : FeatureSet< [FeatureSeaIslands, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion7_0_1 : FeatureSet< [FeatureSeaIslands, HalfRate64Ops, FeatureLDSBankCount32, - FeatureFastFMAF32, - FeatureDoesNotSupportXNACK]>; + FeatureFastFMAF32]>; def FeatureISAVersion7_0_2 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount16, - FeatureFastFMAF32, - FeatureDoesNotSupportXNACK]>; + FeatureFastFMAF32]>; def FeatureISAVersion7_0_3 : FeatureSet< [FeatureSeaIslands, - FeatureLDSBankCount16, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount16]>; def FeatureISAVersion7_0_4 : FeatureSet< [FeatureSeaIslands, - FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount32]>; def FeatureISAVersion7_0_5 : FeatureSet< [FeatureSeaIslands, - FeatureLDSBankCount16, - FeatureDoesNotSupportXNACK]>; + FeatureLDSBankCount16]>; def FeatureISAVersion8_0_1 : FeatureSet< [FeatureVolcanicIslands, FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, - FeatureXNACK, + FeatureSupportsXNACK, FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_2 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureSGPRInitBug, - FeatureUnpackedD16VMem, - FeatureDoesNotSupportXNACK]>; + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_3 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, - FeatureUnpackedD16VMem, - FeatureDoesNotSupportXNACK]>; + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_5 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureSGPRInitBug, - FeatureUnpackedD16VMem, - FeatureDoesNotSupportXNACK]>; + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_1_0 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount16, - FeatureXNACK, + FeatureSupportsXNACK, FeatureImageStoreD16Bug, FeatureImageGather4D16Bug]>; @@ -829,24 +812,18 @@ [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK, - FeatureDoesNotSupportSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_2 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureXNACK, - FeatureDoesNotSupportSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_4 : FeatureSet< [FeatureGFX9, FeatureLDSBankCount32, FeatureFmaMixInsts, - FeatureDoesNotSupportXNACK, - FeatureDoesNotSupportSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_6 : FeatureSet< @@ -857,7 +834,7 @@ FeatureDLInsts, FeatureDot1Insts, FeatureDot2Insts, - FeatureDoesNotSupportXNACK, + FeatureSupportsSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_8 : FeatureSet< @@ -875,15 +852,14 @@ FeatureMAIInsts, FeaturePkFmacF16Inst, FeatureAtomicFaddInsts, - FeatureSRAMECC, FeatureMFMAInlineLiteralBug, + FeatureSupportsSRAMECC, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_9 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureXNACK, FeatureImageGather4D16Bug]>; // TODO: Organize more features into groups. @@ -917,7 +893,7 @@ FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureDoesNotSupportXNACK])>; + FeatureSupportsXNACK])>; def FeatureISAVersion10_1_1 : FeatureSet< !listconcat(FeatureGroup.GFX10_1_Bugs, @@ -938,7 +914,7 @@ FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureDoesNotSupportXNACK])>; + FeatureSupportsXNACK])>; def FeatureISAVersion10_1_2 : FeatureSet< !listconcat(FeatureGroup.GFX10_1_Bugs, @@ -959,7 +935,7 @@ FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureDoesNotSupportXNACK])>; + FeatureSupportsXNACK])>; def FeatureISAVersion10_3_0 : FeatureSet< [FeatureGFX10, @@ -972,8 +948,7 @@ FeatureDot5Insts, FeatureDot6Insts, FeatureNSAEncoding, - FeatureWavefrontSize32, - FeatureDoesNotSupportXNACK]>; + FeatureWavefrontSize32]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -300,6 +300,8 @@ std::unique_ptr Legalizer; std::unique_ptr RegBankInfo; + Optional TargetID; + protected: // Basic subtarget description. Triple TargetTriple; @@ -320,8 +322,12 @@ bool UnalignedBufferAccess; bool UnalignedAccessMode; bool HasApertureRegs; + bool SupportsXNACK; + + // This should not be used directly. 'TargetID' tracks the dynamic settings + // for XNACK. bool EnableXNACK; - bool DoesNotSupportXNACK; + bool EnableCuMode; bool TrapHandler; @@ -375,8 +381,12 @@ bool HasMAIInsts; bool HasPkFmacF16Inst; bool HasAtomicFaddInsts; + bool SupportsSRAMECC; + + // This should not be used directly. 'TargetID' tracks the dynamic settings + // for SRAMECC. bool EnableSRAMECC; - bool DoesNotSupportSRAMECC; + bool HasNoSdstCMPX; bool HasVscnt; bool HasGetWaveIdInst; @@ -469,6 +479,11 @@ return RegBankInfo.get(); } + const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { + assert(TargetID.hasValue() && "TargetID has not be initialized"); + return *TargetID; + } + // Nothing implemented, just prevent crashes on use. const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { return &TSInfo; @@ -723,7 +738,7 @@ } bool isXNACKEnabled() const { - return EnableXNACK; + return getTargetID().isXnackOnOrAny(); } bool isCuModeEnabled() const { @@ -786,7 +801,7 @@ } bool d16PreservesUnusedBits() const { - return hasD16LoadStore() && !isSRAMECCEnabled(); + return hasD16LoadStore() && !getTargetID().isSramEccOnOrAny(); } bool hasD16Images() const { @@ -894,10 +909,6 @@ return HasAtomicFaddInsts; } - bool isSRAMECCEnabled() const { - return EnableSRAMECC; - } - bool hasNoSdstCMPX() const { return HasNoSdstCMPX; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -13,18 +13,19 @@ #include "AMDGPUSubtarget.h" #include "AMDGPU.h" -#include "AMDGPUTargetMachine.h" #include "AMDGPUCallLowering.h" #include "AMDGPUInstructionSelector.h" #include "AMDGPULegalizerInfo.h" #include "AMDGPURegisterBankInfo.h" -#include "SIMachineFunctionInfo.h" +#include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/IR/MDBuilder.h" #include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/MC/MCSubtargetInfo.h" #include using namespace llvm; @@ -76,9 +77,9 @@ // // Similarly we want enable-prt-strict-null to be on by default and not to // unset everything else if it is disabled + TargetID.emplace(*this); - // Assuming ECC is enabled is the conservative default. - SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,"); + SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; @@ -131,20 +132,12 @@ HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - // Disable XNACK on targets where it is not enabled by default unless it is - // explicitly requested. - if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) { - ToggleFeature(AMDGPU::FeatureXNACK); - EnableXNACK = false; - } + TargetID->setTargetIDFromFeaturesString(FS); - // ECC is on by default, but turn it off if the hardware doesn't support it - // anyway. This matters for the gfx9 targets with d16 loads, but don't support - // ECC. - if (DoesNotSupportSRAMECC && EnableSRAMECC) { - ToggleFeature(AMDGPU::FeatureSRAMECC); - EnableSRAMECC = false; - } + LLVM_DEBUG(dbgs() << "xnack setting for subtarget: " + << TargetID->getXnackSetting() << '\n'); + LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: " + << TargetID->getSramEccSetting() << '\n'); return *this; } @@ -189,8 +182,8 @@ UnalignedAccessMode(false), HasApertureRegs(false), + SupportsXNACK(false), EnableXNACK(false), - DoesNotSupportXNACK(false), EnableCuMode(false), TrapHandler(false), @@ -239,8 +232,8 @@ HasMAIInsts(false), HasPkFmacF16Inst(false), HasAtomicFaddInsts(false), + SupportsSRAMECC(false), EnableSRAMECC(false), - DoesNotSupportSRAMECC(false), HasNoSdstCMPX(false), HasVscnt(false), HasGetWaveIdInst(false), diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -19,6 +19,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetParser.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -70,6 +71,43 @@ TRAP_NUM_SGPRS = 16 }; +enum class TargetIDSetting { + Unsupported, + Any, + Off, + On +}; + +class AMDGPUTargetID { +private: + TargetIDSetting XnackSetting; + TargetIDSetting SramEccSetting; + +public: + explicit AMDGPUTargetID(const MCSubtargetInfo &STI); + ~AMDGPUTargetID() = default; + + /// Returns the current xnack TargetIDSetting, possible options are + /// "Unsupported", "Any", "Off", and "On". + TargetIDSetting getXnackSetting() const; + + /// Returns the current sramecc TargetIDSetting, possible options are + /// "Unsupported", "Any", "Off", and "On". + TargetIDSetting getSramEccSetting() const; + + /// Returns true if the current xnack setting is "On" or "Any". + bool isXnackOnOrAny() const; + + /// Returns true if the current sramecc setting is "On" or "Any". + bool isSramEccOnOrAny() const; + + bool isXnackSupported() const; + bool isSramEccSupported() const; + + void setTargetIDFromFeaturesString(StringRef FS); + void setTargetIDFromTargetIDStream(StringRef TargetID); +}; + /// Streams isa version string for given subtarget \p STI into \p Stream. void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream); @@ -844,6 +882,10 @@ }; } // end namespace AMDGPU + +raw_ostream &operator<<(raw_ostream &OS, + const AMDGPU::IsaInfo::TargetIDSetting S); + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -267,6 +267,119 @@ namespace IsaInfo { +AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI) + : XnackSetting(TargetIDSetting::Any), SramEccSetting(TargetIDSetting::Any) { + if (!STI.getFeatureBits().test(FeatureSupportsXNACK)) + XnackSetting = TargetIDSetting::Unsupported; + if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC)) + SramEccSetting = TargetIDSetting::Unsupported; +} + +TargetIDSetting AMDGPUTargetID::getXnackSetting() const { + return XnackSetting; +} + +TargetIDSetting AMDGPUTargetID::getSramEccSetting() const { + return SramEccSetting; +} + +bool AMDGPUTargetID::isXnackOnOrAny() const { + return XnackSetting == TargetIDSetting::On || + XnackSetting == TargetIDSetting::Any; +} + +bool AMDGPUTargetID::isSramEccOnOrAny() const { + return SramEccSetting == TargetIDSetting::On || + SramEccSetting == TargetIDSetting::Any; +} + +bool AMDGPUTargetID::isXnackSupported() const { + return XnackSetting != TargetIDSetting::Unsupported; +} +bool AMDGPUTargetID::isSramEccSupported() const { + return SramEccSetting != TargetIDSetting::Unsupported; +} + +void AMDGPUTargetID::setTargetIDFromFeaturesString(StringRef FS) { + // Check if xnack or sramecc is explicitly enabled or disabled. In the + // absence of the target features we assume we must generate code that can run + // in any environment. + SubtargetFeatures Features(FS); + Optional XnackRequested; + Optional SramEccRequested; + + for (const std::string &Feature : Features.getFeatures()) { + if (Feature == "+xnack") + XnackRequested = true; + else if (Feature == "-xnack") + XnackRequested = false; + else if (Feature == "+sramecc") + SramEccRequested = true; + else if (Feature == "-sramecc") + SramEccRequested = false; + } + + bool XnackSupported = isXnackSupported(); + bool SramEccSupported = isSramEccSupported(); + + if (XnackRequested.hasValue()) { + if (XnackSupported) { + XnackSetting = + *XnackRequested ? TargetIDSetting::On : TargetIDSetting::Off; + } else { + // If a specific xnack setting was requested and this GPU does not support + // xnack emit a warning. Setting will remain set to "Unsupported". + if (*XnackRequested) { + errs() << "warning: xnack 'On' was requested for a processor that does " + "not support it!\n"; + } else { + errs() << "warning: xnack 'Off' was requested for a processor that " + "does not support it!\n"; + } + } + } + + if (SramEccRequested.hasValue()) { + if (SramEccSupported) { + SramEccSetting = + *SramEccRequested ? TargetIDSetting::On : TargetIDSetting::Off; + } else { + // If a specific sramecc setting was requested and this GPU does not + // support sramecc emit a warning. Setting will remain set to + // "Unsupported". + if (*SramEccRequested) { + errs() << "warning: sramecc 'On' was requested for a processor that " + "does not support it!\n"; + } else { + errs() << "warning: sramecc 'Off' was requested for a processor that " + "does not support it!\n"; + } + } + } +} + +static TargetIDSetting +getTargetIDSettingFromFeatureString(StringRef FeatureString) { + if (FeatureString.endswith("-")) + return TargetIDSetting::Off; + if (FeatureString.endswith("+")) + return TargetIDSetting::On; + + llvm_unreachable("Malformed feature string"); +} + +void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) { + SmallVector TargetIDSplit; + TargetID.split(TargetIDSplit, ':'); + + for (const auto &FeatureString : TargetIDSplit) { + if (FeatureString.startswith("xnack")) + XnackSetting = getTargetIDSettingFromFeatureString(FeatureString); + if (FeatureString.startswith("sramecc")) + SramEccSetting = getTargetIDSettingFromFeatureString(FeatureString); + } +} + void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) { auto TargetTriple = STI->getTargetTriple(); auto Version = getIsaVersion(STI->getCPU()); @@ -283,7 +396,7 @@ if (hasXNACK(*STI)) Stream << "+xnack"; if (hasSRAMECC(*STI)) - Stream << "+sram-ecc"; + Stream << "+sramecc"; Stream.flush(); } @@ -1624,4 +1737,24 @@ } } // namespace AMDGPU + +raw_ostream &operator<<(raw_ostream &OS, + const AMDGPU::IsaInfo::TargetIDSetting S) { + switch (S) { + case (AMDGPU::IsaInfo::TargetIDSetting::Unsupported): + OS << "Unsupported"; + break; + case (AMDGPU::IsaInfo::TargetIDSetting::Any): + OS << "Any"; + break; + case (AMDGPU::IsaInfo::TargetIDSetting::Off): + OS << "Off"; + break; + case (AMDGPU::IsaInfo::TargetIDSetting::On): + OS << "On"; + break; + } + return OS; +} + } // namespace llvm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s ; Check lowering of some large extractelement that use the stack ; instead of register indexing. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -4,32 +4,14 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(<4 x i128> addrspace(4)* inreg %ptr, i32 inreg %idx) { -; GFX9-LABEL: extractelement_sgpr_v4i128_sgpr_idx: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 -; GFX9-NEXT: s_lshl_b32 m0, s4, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GFX9-NEXT: s_movrels_b64 s[2:3], s[10:11] -; GFX9-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: extractelement_sgpr_v4i128_sgpr_idx: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 -; GFX8-NEXT: s_lshl_b32 m0, s4, 1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GFX8-NEXT: s_movrels_b64 s[2:3], s[10:11] -; GFX8-NEXT: ; return to shader part epilog -; -; GFX7-LABEL: extractelement_sgpr_v4i128_sgpr_idx: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 -; GFX7-NEXT: s_lshl_b32 m0, s4, 1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GFX7-NEXT: s_movrels_b64 s[2:3], s[10:11] -; GFX7-NEXT: ; return to shader part epilog +; GCN-LABEL: extractelement_sgpr_v4i128_sgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 +; GCN-NEXT: s_lshl_b32 m0, s4, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_movrels_b64 s[0:1], s[8:9] +; GCN-NEXT: s_movrels_b64 s[2:3], s[10:11] +; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx ret i128 %element @@ -112,57 +94,57 @@ ; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 1, v2 -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: v_add_u32_e32 v17, 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v8, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v9, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, v13, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v10, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, v11, v7, vcc ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v14, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -1672,7 +1672,7 @@ ; GPRIDX-NEXT: is_ptr64 = 1 ; GPRIDX-NEXT: is_dynamic_callstack = 0 ; GPRIDX-NEXT: is_debug_enabled = 0 -; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 1 ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 @@ -2187,7 +2187,7 @@ ; GPRIDX-NEXT: is_ptr64 = 1 ; GPRIDX-NEXT: is_dynamic_callstack = 0 ; GPRIDX-NEXT: is_debug_enabled = 0 -; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 1 ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 @@ -2363,7 +2363,7 @@ ; GPRIDX-NEXT: is_ptr64 = 1 ; GPRIDX-NEXT: is_dynamic_callstack = 0 ; GPRIDX-NEXT: is_debug_enabled = 0 -; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 1 ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -14,29 +14,29 @@ ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_add_u32_e32 v31, 64, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x0 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x0 ; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x40 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x80 +; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x80 ; GCN-NEXT: v_add_u32_e32 v32, 0x44, v16 ; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: v_mov_b32_e32 v4, s16 -; GCN-NEXT: v_mov_b32_e32 v5, s17 -; GCN-NEXT: v_mov_b32_e32 v6, s18 -; GCN-NEXT: v_mov_b32_e32 v7, s19 -; GCN-NEXT: v_mov_b32_e32 v8, s20 -; GCN-NEXT: v_mov_b32_e32 v9, s21 -; GCN-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NEXT: v_mov_b32_e32 v11, s23 -; GCN-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NEXT: v_mov_b32_e32 v13, s25 -; GCN-NEXT: v_mov_b32_e32 v14, s26 -; GCN-NEXT: v_mov_b32_e32 v15, s27 -; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0xc0 +; GCN-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NEXT: v_mov_b32_e32 v4, s40 +; GCN-NEXT: v_mov_b32_e32 v5, s41 +; GCN-NEXT: v_mov_b32_e32 v6, s42 +; GCN-NEXT: v_mov_b32_e32 v7, s43 +; GCN-NEXT: v_mov_b32_e32 v8, s44 +; GCN-NEXT: v_mov_b32_e32 v9, s45 +; GCN-NEXT: v_mov_b32_e32 v10, s46 +; GCN-NEXT: v_mov_b32_e32 v11, s47 +; GCN-NEXT: v_mov_b32_e32 v12, s48 +; GCN-NEXT: v_mov_b32_e32 v13, s49 +; GCN-NEXT: v_mov_b32_e32 v14, s50 +; GCN-NEXT: v_mov_b32_e32 v15, s51 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256 ; GCN-NEXT: v_add_u32_e32 v0, 4, v16 ; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -88,110 +88,110 @@ ; GCN-NEXT: v_add_u32_e32 v46, 0x7c, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s67 ; GCN-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s12 ; GCN-NEXT: v_add_u32_e32 v47, 0x80, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_add_u32_e32 v48, 0x84, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s14 ; GCN-NEXT: v_add_u32_e32 v49, 0x88, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s38 ; GCN-NEXT: buffer_store_dword v1, v49, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s11, 0x90 +; GCN-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NEXT: v_add_u32_e32 v50, 0x8c, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NEXT: s_movk_i32 s11, 0x90 ; GCN-NEXT: buffer_store_dword v1, v50, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s16 ; GCN-NEXT: v_add_u32_e32 v51, s11, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s40 ; GCN-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v52, 0x94, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NEXT: buffer_store_dword v1, v52, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v53, 0x98, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NEXT: v_mov_b32_e32 v1, s18 ; GCN-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s28, 0xa0 +; GCN-NEXT: s_movk_i32 s12, 0xa0 ; GCN-NEXT: v_add_u32_e32 v54, 0x9c, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NEXT: buffer_store_dword v1, v54, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v55, s28, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NEXT: v_add_u32_e32 v55, s12, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s20 ; GCN-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v56, 0xa4, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NEXT: v_mov_b32_e32 v1, s21 ; GCN-NEXT: buffer_store_dword v1, v56, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v57, 0xa8, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NEXT: v_mov_b32_e32 v1, s22 ; GCN-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s29, 0xb0 +; GCN-NEXT: s_movk_i32 s13, 0xb0 ; GCN-NEXT: v_add_u32_e32 v58, 0xac, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NEXT: buffer_store_dword v1, v58, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v59, s29, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s48 +; GCN-NEXT: v_add_u32_e32 v59, s13, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s24 ; GCN-NEXT: buffer_store_dword v1, v59, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v60, 0xb4, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NEXT: buffer_store_dword v1, v60, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v61, 0xb8, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s50 +; GCN-NEXT: v_mov_b32_e32 v1, s26 ; GCN-NEXT: buffer_store_dword v1, v61, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v62, 0xbc, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NEXT: buffer_store_dword v1, v62, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s12 ; GCN-NEXT: v_add_u32_e32 v63, 0xc0, v16 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NEXT: buffer_store_dword v1, v63, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_add_u32_e32 v64, 0xc4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: buffer_store_dword v1, v64, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, s14 ; GCN-NEXT: v_add_u32_e32 v65, 0xc8, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s38 ; GCN-NEXT: buffer_store_dword v1, v65, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s12, 0xd0 +; GCN-NEXT: s_movk_i32 s14, 0xd0 ; GCN-NEXT: v_add_u32_e32 v66, 0xcc, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NEXT: buffer_store_dword v1, v66, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v67, s12, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_add_u32_e32 v67, s14, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s40 ; GCN-NEXT: buffer_store_dword v1, v67, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v68, 0xd4, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: v_mov_b32_e32 v1, s41 ; GCN-NEXT: buffer_store_dword v1, v68, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v69, 0xd8, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NEXT: buffer_store_dword v1, v69, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s13, 0xe0 +; GCN-NEXT: s_movk_i32 s15, 0xe0 ; GCN-NEXT: v_add_u32_e32 v70, 0xdc, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NEXT: v_mov_b32_e32 v1, s43 ; GCN-NEXT: buffer_store_dword v1, v70, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v71, s13, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NEXT: v_add_u32_e32 v71, s15, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NEXT: buffer_store_dword v1, v71, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v72, 0xe4, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NEXT: v_mov_b32_e32 v1, s45 ; GCN-NEXT: buffer_store_dword v1, v72, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v73, 0xe8, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s22 +; GCN-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NEXT: buffer_store_dword v1, v73, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s14, 0xf0 +; GCN-NEXT: s_movk_i32 s16, 0xf0 ; GCN-NEXT: v_add_u32_e32 v74, 0xec, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NEXT: v_mov_b32_e32 v1, s47 ; GCN-NEXT: buffer_store_dword v1, v74, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v75, s14, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NEXT: v_add_u32_e32 v75, s16, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NEXT: buffer_store_dword v1, v75, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v76, 0xf4, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NEXT: v_mov_b32_e32 v1, s49 ; GCN-NEXT: s_and_b32 s7, s7, 63 ; GCN-NEXT: buffer_store_dword v1, v76, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v77, 0xf8, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NEXT: v_add_u32_e32 v17, 8, v16 ; GCN-NEXT: buffer_store_dword v1, v77, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v78, 0xfc, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NEXT: v_mov_b32_e32 v1, s51 ; GCN-NEXT: s_lshl_b32 s7, s7, 2 ; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v18, 12, v16 @@ -225,6 +225,7 @@ ; GCN-NEXT: buffer_store_dword v15, v30, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v2, v17, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v3, v18, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v4, v19, s[0:3], 0 offen @@ -337,12 +338,12 @@ ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s28 +; GCN-NEXT: s_add_u32 s4, s8, s12 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[36:39], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s29 +; GCN-NEXT: s_add_u32 s4, s8, s13 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 @@ -352,17 +353,17 @@ ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s12 +; GCN-NEXT: s_add_u32 s4, s8, s14 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s13 +; GCN-NEXT: s_add_u32 s4, s8, s15 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[52:55], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s14 +; GCN-NEXT: s_add_u32 s4, s8, s16 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[56:59], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -2133,8 +2133,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v16i16_s_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: s_and_b32 s1, s3, 1 ; GFX9-NEXT: s_lshr_b32 s12, s3, 1 ; GFX9-NEXT: s_mov_b32 s0, 0xffff @@ -2152,27 +2152,27 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v10, 16 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off @@ -2846,8 +2846,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v16i16_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX9-NEXT: s_mov_b32 s0, 0xffff @@ -2865,26 +2865,26 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] -; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v10, 16 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off @@ -2995,8 +2995,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v16i16_v_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: s_and_b32 s1, s2, 1 ; GFX9-NEXT: s_lshr_b32 s12, s2, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 @@ -3012,27 +3012,27 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] -; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v10, 16 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off @@ -3143,8 +3143,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v16i16_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 @@ -3161,26 +3161,26 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] ; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v10, 16 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -38,6 +38,7 @@ ; GCN-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:48 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 ; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 ; GCN-NEXT: s_waitcnt vmcnt(7) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -1520,11 +1520,11 @@ ; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: ds_inc_rtn_u32 v4, v1, v0 ; GFX9-NEXT: ds_inc_rtn_u32 v5, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -256,39 +256,39 @@ ; GFX10_W32-LABEL: test_div_fmas_f32: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x4 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dword s7, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s4 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, s5, v0, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s7 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x4 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dword s7, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s4 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, s5, v0, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s7 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) @@ -336,35 +336,35 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v2, 1.0, s4, v0 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v2, 1.0, s4, v0 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) @@ -412,35 +412,35 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x58 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x58 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x34 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, 1.0, v0 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x58 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x58 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x34 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, 1.0, v0 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) @@ -488,35 +488,35 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, 1.0 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, 1.0 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) @@ -568,37 +568,37 @@ ; GFX10_W32-LABEL: test_div_fmas_f64: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x1 -; GFX10_W32-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX10_W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10_W32-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s8, 1, s8 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s6 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s5 -; GFX10_W32-NEXT: v_mov_b32_e32 v3, s7 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 -; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] -; GFX10_W32-NEXT: v_mov_b32_e32 v3, s1 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s0 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s8 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s10 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s9 +; GFX10_W32-NEXT: v_mov_b32_e32 v3, s11 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4 +; GFX10_W32-NEXT: v_mov_b32_e32 v3, s5 ; GFX10_W32-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f64: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x1 -; GFX10_W64-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX10_W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10_W64-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s8, 1, s8 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s6 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5 -; GFX10_W64-NEXT: v_mov_b32_e32 v3, s7 -; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] -; GFX10_W64-NEXT: v_mov_b32_e32 v3, s1 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s0 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s8 +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s10 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s9 +; GFX10_W64-NEXT: v_mov_b32_e32 v3, s11 +; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s4 +; GFX10_W64-NEXT: v_mov_b32_e32 v3, s5 ; GFX10_W64-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10_W64-NEXT: s_endpgm %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) @@ -649,18 +649,18 @@ ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x1 ; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; @@ -668,17 +668,17 @@ ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x1 ; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %cmp = icmp eq i32 %i, 0 @@ -725,35 +725,35 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, 0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) @@ -799,35 +799,35 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, 1 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, 1 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) @@ -906,18 +906,18 @@ ; GFX10_W32-NEXT: s_clause 0x2 ; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7] ; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4 -; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8 +; GFX10_W32-NEXT: global_load_dword v4, v1, s[6:7] offset:8 ; GFX10_W32-NEXT: s_add_u32 s0, s4, 8 ; GFX10_W32-NEXT: s_addc_u32 s1, s5, 0 ; GFX10_W32-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s2, 0, s2 ; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s2 ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) -; GFX10_W32-NEXT: v_div_fmas_f32 v2, v2, v3, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W32-NEXT: v_div_fmas_f32 v2, v2, v3, v4 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; @@ -931,18 +931,18 @@ ; GFX10_W64-NEXT: s_clause 0x2 ; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7] ; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4 -; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8 +; GFX10_W64-NEXT: global_load_dword v4, v1, s[6:7] offset:8 ; GFX10_W64-NEXT: s_add_u32 s0, s4, 8 ; GFX10_W64-NEXT: s_addc_u32 s1, s5, 0 ; GFX10_W64-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 ; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 ; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[2:3] ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-NEXT: v_div_fmas_f32 v2, v2, v3, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W64-NEXT: v_div_fmas_f32 v2, v2, v3, v4 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -51,9 +51,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v2, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -118,9 +118,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v0, v1 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -184,13 +184,13 @@ ; GFX10-LABEL: test_div_scale_f64_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v3, s1 @@ -256,13 +256,13 @@ ; GFX10-LABEL: test_div_scale_f64_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v3, s1 @@ -808,14 +808,14 @@ ; GFX10-LABEL: test_div_scale_f32_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, s3, s3, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_div_scale_f32 v2, s0, s5, s5, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) @@ -854,14 +854,14 @@ ; GFX10-LABEL: test_div_scale_f32_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, s2, s3, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_div_scale_f32 v2, s0, s4, s5, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) @@ -902,14 +902,14 @@ ; GFX10-LABEL: test_div_scale_f64_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x74 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3] -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[6:7], s[6:7], s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) @@ -950,14 +950,14 @@ ; GFX10-LABEL: test_div_scale_f64_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x74 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3] -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[4:5], s[6:7], s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) @@ -1133,11 +1133,11 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v2, v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -1206,9 +1206,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2 ; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll @@ -16,19 +16,6 @@ ; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 ; UNPACKED-NEXT: s_endpgm ; -; PACKED-LABEL: image_store_f16: -; PACKED: ; %bb.0: -; PACKED-NEXT: s_mov_b32 s0, s2 -; PACKED-NEXT: s_mov_b32 s1, s3 -; PACKED-NEXT: s_mov_b32 s2, s4 -; PACKED-NEXT: s_mov_b32 s3, s5 -; PACKED-NEXT: s_mov_b32 s4, s6 -; PACKED-NEXT: s_mov_b32 s5, s7 -; PACKED-NEXT: s_mov_b32 s6, s8 -; PACKED-NEXT: s_mov_b32 s7, s9 -; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 -; PACKED-NEXT: s_endpgm -; ; GFX81-LABEL: image_store_f16: ; GFX81: ; %bb.0: ; GFX81-NEXT: s_mov_b32 s0, s2 @@ -41,6 +28,18 @@ ; GFX81-NEXT: s_mov_b32 s7, s9 ; GFX81-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 ; GFX81-NEXT: s_endpgm +; PACKED-LABEL: image_store_f16: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_mov_b32 s0, s2 +; PACKED-NEXT: s_mov_b32 s1, s3 +; PACKED-NEXT: s_mov_b32 s2, s4 +; PACKED-NEXT: s_mov_b32 s3, s5 +; PACKED-NEXT: s_mov_b32 s4, s6 +; PACKED-NEXT: s_mov_b32 s5, s7 +; PACKED-NEXT: s_mov_b32 s6, s8 +; PACKED-NEXT: s_mov_b32 s7, s9 +; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 +; PACKED-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } @@ -60,19 +59,6 @@ ; UNPACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16 ; UNPACKED-NEXT: s_endpgm ; -; PACKED-LABEL: image_store_v2f16: -; PACKED: ; %bb.0: -; PACKED-NEXT: s_mov_b32 s0, s2 -; PACKED-NEXT: s_mov_b32 s1, s3 -; PACKED-NEXT: s_mov_b32 s2, s4 -; PACKED-NEXT: s_mov_b32 s3, s5 -; PACKED-NEXT: s_mov_b32 s4, s6 -; PACKED-NEXT: s_mov_b32 s5, s7 -; PACKED-NEXT: s_mov_b32 s6, s8 -; PACKED-NEXT: s_mov_b32 s7, s9 -; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16 -; PACKED-NEXT: s_endpgm -; ; GFX81-LABEL: image_store_v2f16: ; GFX81: ; %bb.0: ; GFX81-NEXT: s_mov_b32 s0, s2 @@ -85,6 +71,18 @@ ; GFX81-NEXT: s_mov_b32 s7, s9 ; GFX81-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16 ; GFX81-NEXT: s_endpgm +; PACKED-LABEL: image_store_v2f16: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_mov_b32 s0, s2 +; PACKED-NEXT: s_mov_b32 s1, s3 +; PACKED-NEXT: s_mov_b32 s2, s4 +; PACKED-NEXT: s_mov_b32 s3, s5 +; PACKED-NEXT: s_mov_b32 s4, s6 +; PACKED-NEXT: s_mov_b32 s5, s7 +; PACKED-NEXT: s_mov_b32 s6, s8 +; PACKED-NEXT: s_mov_b32 s7, s9 +; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16 +; PACKED-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } @@ -114,19 +112,6 @@ ; UNPACKED-NEXT: image_store v[1:4], v[5:6], s[0:7] dmask:0xf unorm d16 ; UNPACKED-NEXT: s_endpgm ; -; PACKED-LABEL: image_store_v4f16: -; PACKED: ; %bb.0: -; PACKED-NEXT: s_mov_b32 s0, s2 -; PACKED-NEXT: s_mov_b32 s1, s3 -; PACKED-NEXT: s_mov_b32 s2, s4 -; PACKED-NEXT: s_mov_b32 s3, s5 -; PACKED-NEXT: s_mov_b32 s4, s6 -; PACKED-NEXT: s_mov_b32 s5, s7 -; PACKED-NEXT: s_mov_b32 s6, s8 -; PACKED-NEXT: s_mov_b32 s7, s9 -; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16 -; PACKED-NEXT: s_endpgm -; ; GFX81-LABEL: image_store_v4f16: ; GFX81: ; %bb.0: ; GFX81-NEXT: s_mov_b32 s0, s2 @@ -139,6 +124,18 @@ ; GFX81-NEXT: s_mov_b32 s7, s9 ; GFX81-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16 ; GFX81-NEXT: s_endpgm +; PACKED-LABEL: image_store_v4f16: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_mov_b32 s0, s2 +; PACKED-NEXT: s_mov_b32 s1, s3 +; PACKED-NEXT: s_mov_b32 s2, s4 +; PACKED-NEXT: s_mov_b32 s3, s5 +; PACKED-NEXT: s_mov_b32 s4, s6 +; PACKED-NEXT: s_mov_b32 s5, s7 +; PACKED-NEXT: s_mov_b32 s6, s8 +; PACKED-NEXT: s_mov_b32 s7, s9 +; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16 +; PACKED-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -21,13 +21,13 @@ ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; encoding: [0x00,0x01,0x00,0xf4,0x2c,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; encoding: [0x02,0x02,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; encoding: [0x01,0x02,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; encoding: [0x04,0x02,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x04,0x7e,0x02,0x01,0x08,0x11] ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x00,0x02,0x7d,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll @@ -4,6 +4,21 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -19,7 +34,12 @@ ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret half %val +} + +define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 @@ -31,14 +51,9 @@ ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) - ret half %val -} - -define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -65,21 +80,6 @@ ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; PACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <2 x half> %val } @@ -91,6 +91,24 @@ ; } define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 + ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 + ; PACKED: $vgpr0 = COPY [[COPY7]] + ; PACKED: $vgpr1 = COPY [[COPY8]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; UNPACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -127,29 +145,27 @@ ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret <4 x half> %val +} + +define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 - ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 - ; PACKED: $vgpr0 = COPY [[COPY7]] - ; PACKED: $vgpr1 = COPY [[COPY8]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) - ret <4 x half> %val -} - -define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -166,27 +182,57 @@ ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 - ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val } define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: successors: %bb.2(0x80000000) + ; PACKED: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; PACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; PACKED: bb.2: + ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; PACKED: bb.3: + ; PACKED: successors: %bb.4(0x80000000) + ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; PACKED: bb.4: + ; PACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 + ; PACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 + ; PACKED: $vgpr0 = COPY [[COPY11]] + ; PACKED: $vgpr1 = COPY [[COPY12]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; UNPACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: successors: %bb.2(0x80000000) @@ -251,72 +297,11 @@ ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: successors: %bb.2(0x80000000) - ; PACKED: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; PACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; PACKED: bb.2: - ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec - ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec - ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; PACKED: bb.3: - ; PACKED: successors: %bb.4(0x80000000) - ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; PACKED: bb.4: - ; PACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 - ; PACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 - ; PACKED: $vgpr0 = COPY [[COPY11]] - ; PACKED: $vgpr1 = COPY [[COPY12]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <4 x half> %val } define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { - ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 - ; UNPACKED: bb.1 (%ir-block.0): - ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; UNPACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7" + 4095, align 1, addrspace 4) - ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] - ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 ; PACKED: bb.1 (%ir-block.0): ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -332,6 +317,21 @@ ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7" + 4095, align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -20,15 +20,15 @@ ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -17,56 +17,56 @@ ; ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 11, v0 -; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off offset:-10 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off offset:-9 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-8 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-7 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-6 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-5 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-4 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off offset:-3 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-2 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xff -; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v3 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v11, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v3 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v5, v6, v3, v7 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v9 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v10, v3, v11 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v4 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v7 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v8, v2 -; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 11, v0 +; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v15, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[2:3], off offset:-10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[2:3], off offset:-9 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-8 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-7 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off offset:-6 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-5 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-4 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v2, s5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v13 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s4, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v15, s4, v2 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v10, v11, v0, v10 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v7, v7, v0, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v6 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v10, v9, v8 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v5, v4 +; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX7-UNALIGNED: ; %bb.0: @@ -155,30 +155,30 @@ ; ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 10, v0 -; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off offset:-8 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off offset:-6 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-4 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-2 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v4, v3, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v6, v3, v2 -; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 10, v0 +; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v9, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v8, v[2:3], off offset:-8 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v7, v[2:3], off offset:-6 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-4 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v8 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v7, v2, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v9, s4, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v5, v2, v3 +; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX7-UNALIGNED: ; %bb.0: @@ -192,36 +192,36 @@ ; ; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX7-NOUNALIGNED: ; %bb.0: -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v4 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v6 -; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v4 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2 ret <3 x i32> %load } @@ -398,101 +398,101 @@ ; ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 1 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 3 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 5 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 7 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v15, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v14, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v17, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v18, v[4:5], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v19, v[6:7], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[8:9], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[10:11], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[12:13], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[14:15], off -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s1 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[4:5], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[6:7], off -; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 8 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, s1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v18 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, s0, v19 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v16, s0, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v7 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v10, v5 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v11, v5 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v8, v5, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v0, v1, v7 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v3, v5 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v4, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v12, v5, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 1 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 3 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 5 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 7 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v15, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v14, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v23, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v22, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v21, v[4:5], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v20, v[6:7], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v19, v[8:9], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v18, v[10:11], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v17, v[12:13], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[14:15], off +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s1 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[4:5], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[6:7], off +; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, s1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v21 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v20 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v23, s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v4 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v17, v2 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v16, v2 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v19, v2, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v0, v1, v4 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v9, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v11, v2, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v8, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX7-UNALIGNED: ; %bb.0: @@ -588,52 +588,52 @@ ; ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 10 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[4:5], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[6:7], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[8:9], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[10:11], off -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v6 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v6, v0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v5, v6 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v6, v0 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 10 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v17, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v16, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v15, v[4:5], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v14, v[6:7], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v13, v[8:9], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v12, v[10:11], off +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v16 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v17, s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v14, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v15, v2, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v12, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v13, v2, v0 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX7-UNALIGNED: ; %bb.0: @@ -651,35 +651,35 @@ ; ; GFX7-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX7-NOUNALIGNED: ; %bb.0: -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:10 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 -; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:10 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2 ret <3 x i32> %load } @@ -687,10 +687,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(<3 x i32> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -709,10 +709,10 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(i96 addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_i96_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -731,10 +731,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(<3 x i32> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -753,10 +753,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(<6 x i16> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v6i16_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -97,17 +97,17 @@ ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, gv3@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, gv3@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: global_store_dword v[0:1], v2, off -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: BB1_2: ; %Flow ; GFX9-NEXT: s_xor_b32 s0, s0, -1 @@ -121,16 +121,16 @@ ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, gv1@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, gv1@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dword v[0:1], v2, off -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: BB1_4: ; %bb2 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -36,17 +36,17 @@ ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x10 -; GCN-NEXT: s_add_u32 s5, s32, 0x1000 +; GCN-NEXT: s_load_dword s8, s[4:5], 0x10 +; GCN-NEXT: s_add_u32 s4, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: s_add_u32 s5, s4, 4 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: s_add_u32 s8, s5, 4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 2 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_lshl_b32 s5, s8, 2 ; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_add_u32 s4, s4, s5 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen @@ -110,18 +110,18 @@ ; GCN-NEXT: s_cbranch_scc1 BB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_load_dword s4, s[4:5], 0xc -; GCN-NEXT: s_add_u32 s5, s32, 0x1000 -; GCN-NEXT: s_and_b32 s5, s5, 0xfffff000 -; GCN-NEXT: s_add_u32 s8, s5, 4 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xc +; GCN-NEXT: s_add_u32 s4, s32, 0x1000 +; GCN-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GCN-NEXT: s_add_u32 s5, s4, 4 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 2 -; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s5, s8, 2 ; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_add_u32 s4, s4, s5 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -8,14 +8,14 @@ define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; @@ -39,51 +39,51 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s5, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 -; GFX9-NEXT: s_lshr_b32 s7, s0, 24 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s3, s4, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NEXT: s_lshr_b32 s1, s5, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s5, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_lshr_b32 s0, s2, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s6, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s0, s3, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_lshr_b32 s0, s7, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s3, 16 +; GFX9-NEXT: s_lshr_b32 s1, s7, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:13 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s2, s3, 24 +; GFX9-NEXT: s_lshr_b32 s2, s7, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:14 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:15 @@ -148,27 +148,27 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s0, s5, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, s6, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s0, s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_lshr_b32 s0, s7, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:14 @@ -209,15 +209,15 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: ds_write2_b32 v1, v3, v0 offset0:2 offset1:3 ; GFX9-NEXT: s_endpgm ; @@ -242,14 +242,14 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; @@ -273,14 +273,14 @@ define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -8,13 +8,13 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; @@ -37,42 +37,42 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_lshr_b32 s0, s12, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s1, s12, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s3, s12, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshr_b32 s0, s13, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshr_b32 s4, s1, 24 +; GFX9-NEXT: s_lshr_b32 s1, s13, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s13, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_lshr_b32 s0, s2, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: s_lshr_b32 s0, s14, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 +; GFX9-NEXT: s_lshr_b32 s1, s14, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s3, s2, 24 +; GFX9-NEXT: s_lshr_b32 s2, s14, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX9-NEXT: s_endpgm ; @@ -124,22 +124,22 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_lshr_b32 s0, s12, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshr_b32 s0, s13, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: s_lshr_b32 s0, s14, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 @@ -175,13 +175,13 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX9-NEXT: s_endpgm @@ -206,13 +206,13 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX9-NEXT: s_endpgm @@ -237,13 +237,13 @@ define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -360,7 +360,6 @@ ; IR-LABEL: @select_mul_lhs_const_i32( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000 ; IR-NEXT: ret i32 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %op = mul i32 1000, %select ret i32 %op @@ -380,7 +379,6 @@ ; IR-LABEL: @select_mul_rhs_const_i32( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000 ; IR-NEXT: ret i32 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %op = mul i32 %select, 1000 ret i32 %op @@ -420,7 +418,6 @@ ; IR-LABEL: @select_add_trunc_select( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50 ; IR-NEXT: ret i16 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %trunc = trunc i32 %select to i16 %op = add i16 %trunc, 42 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -177,33 +177,33 @@ ; ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 +; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB1_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm @@ -244,28 +244,28 @@ ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -277,20 +277,20 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s1, s0, s1 -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: s_mul_i32 s3, s2, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -299,9 +299,9 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -313,20 +313,20 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -335,9 +335,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 @@ -1720,33 +1720,33 @@ ; ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB9_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 +; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB9_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm @@ -1787,28 +1787,28 @@ ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB9_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -1820,20 +1820,20 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s1, s0, s1 -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: s_mul_i32 s3, s2, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -1842,9 +1842,9 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -1856,20 +1856,20 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -1878,9 +1878,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=-xnack -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s ; For gfx1010, overestimate the branch size in case we need to insert ; a nop for the buggy offset. diff --git a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir --- a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir +++ b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -1,7 +1,9 @@ # RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s -# Make sure the default assumption is xnack enabled with no cpu -# RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+volcanic-islands -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s +# Make sure the default assumption is xnack not supported when no cpu is specified. +# RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+volcanic-islands -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s + +# Make sure the assumption is xnack not supported when fiji cpu is specified. # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s --- # Trivial clause at beginning of program diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -213,6 +213,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8 ; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -323,6 +324,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -345,6 +347,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -1,7 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefixes=CHECK,GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefixes=CHECK,GCN %s ; RUN: FileCheck --enable-var-scope --check-prefixes=CHECK,DBG %s < %t ; REQUIRES: asserts +; FIXME: Verifier error with xnack enabled. + ; CHECK-LABEL: {{^}}cluster_load_cluster_store: define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICI,SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICIVI,SICI,SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICIVI,SICI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,SICIVI,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}load_i32: ; GCN-DAG: s_mov_b32 s3, 0 @@ -9,8 +9,8 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 +; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 +; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 define amdgpu_vs float @load_i32(i32 addrspace(6)* inreg %p0, i32 addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds i32, i32 addrspace(6)* %p1, i32 2 %r0 = load i32, i32 addrspace(6)* %p0 @@ -21,13 +21,18 @@ } ; GCN-LABEL: {{^}}load_v2i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 +; SICIVI-DAG: s_mov_b32 s3, 0 +; SICIVI-DAG: s_mov_b32 s2, s1 +; SICIVI-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; GFX9-DAG: s_mov_b32 s6, s1 +; GFX9-DAG: s_mov_b32 s7, 0 +; GFX9-DAG: s_mov_b32 s1, s7 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[6:7], 0x10 define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(6)* %p1, i32 2 %r0 = load <2 x i32>, <2 x i32> addrspace(6)* %p0 @@ -43,8 +48,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 +; VI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 +; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 define amdgpu_vs <4 x float> @load_v4i32(<4 x i32> addrspace(6)* inreg %p0, <4 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(6)* %p1, i32 2 %r0 = load <4 x i32>, <4 x i32> addrspace(6)* %p0 @@ -60,8 +67,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +; VI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 define amdgpu_vs <8 x float> @load_v8i32(<8 x i32> addrspace(6)* inreg %p0, <8 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(6)* %p1, i32 2 %r0 = load <8 x i32>, <8 x i32> addrspace(6)* %p0 @@ -77,8 +86,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +; VI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 define amdgpu_vs <16 x float> @load_v16i32(<16 x i32> addrspace(6)* inreg %p0, <16 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <16 x i32>, <16 x i32> addrspace(6)* %p1, i32 2 %r0 = load <16 x i32>, <16 x i32> addrspace(6)* %p0 @@ -94,8 +105,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 +; VI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 +; VI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 +; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 +; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 define amdgpu_vs float @load_float(float addrspace(6)* inreg %p0, float addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds float, float addrspace(6)* %p1, i32 2 %r0 = load float, float addrspace(6)* %p0 @@ -105,13 +118,18 @@ } ; GCN-LABEL: {{^}}load_v2float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 +; SICIVI-DAG: s_mov_b32 s3, 0 +; SICIVI-DAG: s_mov_b32 s2, s1 +; SICIVI-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; GFX9-DAG: s_mov_b32 s6, s1 +; GFX9-DAG: s_mov_b32 s7, 0 +; GFX9-DAG: s_mov_b32 s1, s7 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[6:7], 0x10 define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, <2 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(6)* %p1, i32 2 %r0 = load <2 x float>, <2 x float> addrspace(6)* %p0 @@ -126,8 +144,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 +; VI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 +; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 define amdgpu_vs <4 x float> @load_v4float(<4 x float> addrspace(6)* inreg %p0, <4 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <4 x float>, <4 x float> addrspace(6)* %p1, i32 2 %r0 = load <4 x float>, <4 x float> addrspace(6)* %p0 @@ -142,8 +162,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +; VI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 define amdgpu_vs <8 x float> @load_v8float(<8 x float> addrspace(6)* inreg %p0, <8 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <8 x float>, <8 x float> addrspace(6)* %p1, i32 2 %r0 = load <8 x float>, <8 x float> addrspace(6)* %p0 @@ -158,8 +180,10 @@ ; GCN-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +; VI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 +; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 +; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 define amdgpu_vs <16 x float> @load_v16float(<16 x float> addrspace(6)* inreg %p0, <16 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <16 x float>, <16 x float> addrspace(6)* %p1, i32 2 %r0 = load <16 x float>, <16 x float> addrspace(6)* %p0 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -20,16 +20,16 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s0, s2 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 +; SI-NEXT: s_flbit_i32_b32 s5, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctlz_i32: @@ -376,23 +376,23 @@ define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s0, s2 -; SI-NEXT: s_flbit_i32_b32 s1, s3 -; SI-NEXT: s_add_i32 s0, s0, 32 -; SI-NEXT: s_or_b32 s2, s2, s3 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; SI-NEXT: s_flbit_i32_b32 s6, s4 +; SI-NEXT: s_flbit_i32_b32 s7, s5 +; SI-NEXT: s_add_i32 s6, s6, 32 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 +; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctlz_i64: @@ -440,22 +440,22 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s0, s2 -; SI-NEXT: s_flbit_i32_b32 s1, s3 -; SI-NEXT: s_add_i32 s0, s0, 32 -; SI-NEXT: s_or_b32 s2, s2, s3 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; SI-NEXT: s_flbit_i32_b32 s6, s4 +; SI-NEXT: s_flbit_i32_b32 s7, s5 +; SI-NEXT: s_add_i32 s6, s6, 32 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 +; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctlz_i64_trunc: diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll --- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll +++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll @@ -17,8 +17,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 < %s | FileCheck --check-prefixes=GFX704 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck --check-prefixes=GFX704 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx705 < %s | FileCheck --check-prefixes=GFX705 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck --check-prefixes=GFX801 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo < %s | FileCheck --check-prefixes=GFX801 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx801 < %s | FileCheck --check-prefixes=GFX801 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=carrizo < %s | FileCheck --check-prefixes=GFX801 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 < %s | FileCheck --check-prefixes=GFX802 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland < %s | FileCheck --check-prefixes=GFX802 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck --check-prefixes=GFX802 %s @@ -38,11 +38,14 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck --check-prefixes=XNACK-GFX900 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-xnack < %s | FileCheck --check-prefixes=NO-XNACK-GFX902 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+sram-ecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX904 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sram-ecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+sramecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX904 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+sram-ecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX904 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sram-ecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+sramecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX904 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s + +; FIXME: With the default attributes these directives are not accurate for +; xnack and sramecc. Subsequent Target-ID patches will address this. ; GFX600: .amdgcn_target "amdgcn-amd-amdhsa--gfx600" ; GFX601: .amdgcn_target "amdgcn-amd-amdhsa--gfx601" @@ -53,24 +56,24 @@ ; GFX703: .amdgcn_target "amdgcn-amd-amdhsa--gfx703" ; GFX704: .amdgcn_target "amdgcn-amd-amdhsa--gfx704" ; GFX705: .amdgcn_target "amdgcn-amd-amdhsa--gfx705" -; GFX801: .amdgcn_target "amdgcn-amd-amdhsa--gfx801+xnack" +; GFX801: .amdgcn_target "amdgcn-amd-amdhsa--gfx801" ; GFX802: .amdgcn_target "amdgcn-amd-amdhsa--gfx802" ; GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803" ; GFX805: .amdgcn_target "amdgcn-amd-amdhsa--gfx805" -; GFX810: .amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack" +; GFX810: .amdgcn_target "amdgcn-amd-amdhsa--gfx810" ; GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" -; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902+xnack" +; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902" ; GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904" ; GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906" ; XNACK-GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack" ; NO-XNACK-GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902" -; SRAM-ECC-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+sram-ecc" -; SRAM-ECC-GFX906: "amdgcn-amd-amdhsa--gfx906+sram-ecc" +; SRAM-ECC-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+sramecc" +; SRAM-ECC-GFX906: "amdgcn-amd-amdhsa--gfx906+sramecc" -; SRAM-ECC-XNACK-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack+sram-ecc" -; SRAM-ECC-XNACK-GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sram-ecc" +; SRAM-ECC-XNACK-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack+sramecc" +; SRAM-ECC-XNACK-GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sramecc" define amdgpu_kernel void @directive_amdgcn_target() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -59,6 +59,9 @@ ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1031 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1031 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1032 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1032 %s +; FIXME: With the default attributes the eflags are not accurate for +; xnack and sramecc. Subsequent Target-ID patches will address this. + ; ARCH-R600: Arch: r600 ; ARCH-GCN: Arch: amdgcn @@ -89,19 +92,15 @@ ; GFX704: EF_AMDGPU_MACH_AMDGCN_GFX704 (0x26) ; GFX705: EF_AMDGPU_MACH_AMDGCN_GFX705 (0x3B) ; GFX801: EF_AMDGPU_MACH_AMDGCN_GFX801 (0x28) -; GFX801-NEXT: EF_AMDGPU_XNACK (0x100) ; GFX802: EF_AMDGPU_MACH_AMDGCN_GFX802 (0x29) ; GFX803: EF_AMDGPU_MACH_AMDGCN_GFX803 (0x2A) ; GFX805: EF_AMDGPU_MACH_AMDGCN_GFX805 (0x3C) ; GFX810: EF_AMDGPU_MACH_AMDGCN_GFX810 (0x2B) -; GFX810-NEXT: EF_AMDGPU_XNACK (0x100) ; GFX900: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) ; GFX902: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) -; GFX902-NEXT: EF_AMDGPU_XNACK (0x100) ; GFX904: EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E) ; GFX906: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) ; GFX908: EF_AMDGPU_MACH_AMDGCN_GFX908 (0x30) -; GFX908-NEXT: EF_AMDGPU_SRAM_ECC (0x200) ; GFX909: EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31) ; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33) ; GFX1011: EF_AMDGPU_MACH_AMDGCN_GFX1011 (0x34) diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll @@ -1,24 +1,9 @@ -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=-sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=+sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX902 %s - ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=-sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc,+xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s - -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 < %s | llvm-readobj -file-headers - | FileCheck --check-prefix=SRAM-ECC-GFX908 %s - -; NO-SRAM-ECC-GFX902: Flags [ -; NO-SRAM-ECC-GFX902-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) -; NO-SRAM-ECC-GFX902-NEXT: EF_AMDGPU_XNACK (0x100) -; NO-SRAM-ECC-GFX902-NEXT: ] +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=-sramecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sramecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sramecc,+xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s -; SRAM-ECC-GFX902: Flags [ -; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) -; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_SRAM_ECC (0x200) -; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_XNACK (0x100) -; SRAM-ECC-GFX902-NEXT: ] +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 -mattr=+sramecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefix=SRAM-ECC-GFX908 %s ; NO-SRAM-ECC-GFX906: Flags [ ; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -33,9 +33,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -31,9 +31,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen offset:2 +; GFX9-NEXT: buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -1,11 +1,11 @@ ; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo --amdhsa-code-object-version=2 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -7,26 +7,26 @@ ; SI-LABEL: frem_f16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 @@ -42,7 +42,7 @@ ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_f16: @@ -126,22 +126,22 @@ ; SI-LABEL: fast_frem_f16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_rcp_f32_e32 v2, v1 @@ -149,7 +149,7 @@ ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: fast_frem_f16: @@ -216,22 +216,22 @@ ; SI-LABEL: unsafe_frem_f16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_rcp_f32_e32 v2, v1 @@ -239,7 +239,7 @@ ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f16: @@ -306,23 +306,23 @@ ; SI-LABEL: frem_f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 @@ -336,7 +336,7 @@ ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_f32: @@ -421,26 +421,26 @@ ; SI-LABEL: fast_frem_f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: fast_frem_f32: @@ -503,26 +503,26 @@ ; SI-LABEL: unsafe_frem_f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f32: @@ -585,20 +585,20 @@ ; SI-LABEL: frem_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s8 ; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -712,20 +712,20 @@ ; SI-LABEL: fast_frem_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s8 ; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -839,27 +839,27 @@ ; SI-LABEL: unsafe_frem_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] ; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 ; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s14 +; SI-NEXT: s_mov_b32 s0, s10 ; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 ; SI-NEXT: v_not_b32_e32 v6, v6 ; SI-NEXT: v_and_b32_e32 v6, v4, v6 @@ -873,7 +873,7 @@ ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f64: @@ -1813,20 +1813,20 @@ ; SI-LABEL: frem_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s8 ; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 offset:64 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] ; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -43,17 +43,17 @@ ; ; GFX9-LABEL: fshl_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: s_lshr_b32 s1, s0, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s1, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_not_b32 s1, s6 +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -104,10 +104,10 @@ ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, 25 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, 25 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -184,20 +184,20 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s0, s5, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_not_b32 s1, s1 +; GFX9-NEXT: s_not_b32 s1, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s7, s5, 1 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s7, v0, v1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_not_b32 s1, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v2 +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -261,10 +261,10 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v2, 25 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -366,37 +366,37 @@ ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s0, s7, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s11, s7, 1 +; GFX9-NEXT: s_not_b32 s1, s15 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1 +; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_not_b32 s1, s14 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; GFX9-NEXT: s_lshr_b32 s3, s6, 1 -; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 1 +; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: s_not_b32 s1, s1 +; GFX9-NEXT: s_not_b32 s1, s13 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s5, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 +; GFX9-NEXT: s_lshr_b32 s0, s5, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_not_b32 s1, s12 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -472,19 +472,19 @@ ; ; GFX9-LABEL: fshl_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 23 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -46,14 +46,14 @@ ; ; GFX9-LABEL: fshr_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -101,10 +101,10 @@ ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, 7 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -169,13 +169,13 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -236,10 +236,10 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v2, 7 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -317,25 +317,25 @@ ; ; GFX9-LABEL: fshr_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -403,19 +403,19 @@ ; ; GFX9-LABEL: fshr_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -389,6 +389,7 @@ ; FIXME: Should not scalarize ; GCN-LABEL: {{^}}v5i16_func_void: ; GFX9: buffer_load_dwordx2 v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v2 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir @@ -1,6 +1,6 @@ -# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GCX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GCX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s # GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr_hidden_bundle # GCN: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir @@ -1,6 +1,6 @@ -# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s # GCN-LABEL: name: break_smem_clause_max_look_ahead_in_bundle # GCN: S_LOAD_DWORDX2_IMM diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -1,7 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=WAVE64 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=WAVE64 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=WAVE64 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX1010 --check-prefix=WAVE32 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=WAVE64 --check-prefix=NOTES %s +; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=WAVE64 --check-prefix=NOTES %s +; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=WAVE64 --check-prefix=NOTES %s +; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX1010 --check-prefix=WAVE32 --check-prefix=NOTES %s @var = addrspace(1) global float 0.0 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=2 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=2 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=2 -mattr=-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -mattr=-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s @var = addrspace(1) global float 0.0 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -5,8 +5,8 @@ ; GFX9-LABEL: udiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_sub_i32 s3, 0, s2 @@ -65,8 +65,8 @@ ; GFX9-LABEL: urem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_sub_i32 s3, 0, s2 @@ -123,6 +123,7 @@ ; GFX9-LABEL: sdiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s3, 31 @@ -183,6 +184,7 @@ ; GFX9-LABEL: srem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s3, s2, 31 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -62,59 +62,59 @@ ; GFX9-NODL-LABEL: udot2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -198,48 +198,48 @@ ; GFX9-NODL-LABEL: udot2_MulMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v0, s5, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, s9, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v0, s2, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, s5, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MulMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v0, s5, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_add_u32_e32 v2, s9, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v0, s2, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_add_u32_e32 v2, s5, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -247,23 +247,23 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 -; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s5, s6 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s4, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s4, s5 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s8, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -340,58 +340,58 @@ ; GFX9-NODL-LABEL: idot2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -469,69 +469,69 @@ ; GFX9-NODL-LABEL: idot2_MixedTypedMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedTypedMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedTypedMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -612,59 +612,59 @@ ; GFX9-NODL-LABEL: udot2_alt_AddOperands: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_alt_AddOperands: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -742,69 +742,69 @@ ; GFX9-NODL-LABEL: idot2_MixedExt: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s6, s3, 0xffff -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedExt: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s6, s3, 0xffff -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedExt: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -874,38 +874,38 @@ ; GFX9-NODL-LABEL: notudot2_SameVec: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NODL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, s4, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, s1, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_SameVec: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-DL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, s4, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, s1, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -913,19 +913,19 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, s3 -; GFX10-DL-NEXT: s_and_b32 s2, s4, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s1, s8 +; GFX10-DL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1006,59 +1006,59 @@ ; GFX9-NODL-LABEL: udot2_v4i16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1139,59 +1139,59 @@ ; GFX9-NODL-LABEL: udot2_v4i16_Hi: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x4 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x4 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16_Hi: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x4 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1272,72 +1272,72 @@ ; GFX9-NODL-LABEL: notudot2_v4i16_Even: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NODL-NEXT: s_and_b32 s2, s2, s8 -; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, s10 +; GFX9-NODL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-NODL-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Even: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-DL-NEXT: s_and_b32 s2, s2, s8 -; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-DL-NEXT: s_and_b32 s0, s0, s10 +; GFX9-DL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-DL-NEXT: s_and_b32 s4, s4, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: s_and_b32 s4, s8, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Even: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s10, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s1, s1, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 -; GFX10-DL-NEXT: s_and_b32 s0, s0, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0 -; GFX10-DL-NEXT: s_and_b32 s1, s2, s7 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s10 +; GFX10-DL-NEXT: s_and_b32 s1, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s5, s9, s4 +; GFX10-DL-NEXT: s_and_b32 s0, s0, s4 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s1, v0 +; GFX10-DL-NEXT: s_and_b32 s1, s8, s4 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1418,72 +1418,72 @@ ; GFX9-NODL-LABEL: notudot2_v4i16_Middle: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Middle: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Middle: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s10, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s1, s1, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s10 +; GFX10-DL-NEXT: s_and_b32 s1, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s9, s4 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0 -; GFX10-DL-NEXT: s_lshr_b32 s1, s2, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s1, v0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s8, 16 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1564,72 +1564,72 @@ ; GFX9-NODL-LABEL: notudot2_DiffIndex: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_DiffIndex: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_DiffIndex: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-DL-NEXT: s_and_b32 s6, s1, s2 -; GFX10-DL-NEXT: s_and_b32 s0, s0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX10-DL-NEXT: s_and_b32 s6, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s0, s0, s4 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1712,75 +1712,75 @@ ; GFX9-NODL-LABEL: udot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v0 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v0 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v0 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: s_and_b32 s0, s0, s6 ; GFX10-DL-NEXT: s_and_b32 s1, s1, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1862,72 +1862,72 @@ ; GFX9-NODL-LABEL: idot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v0 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s1, s0, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2012,75 +2012,75 @@ ; GFX9-NODL-LABEL: udot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v2, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v2, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2163,72 +2163,72 @@ ; GFX9-NODL-LABEL: idot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v2, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v2, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-DL-NEXT: s_sext_i32_i16 s3, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i16 s5, s1 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 16 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s4, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2314,75 +2314,75 @@ ; GFX9-NODL-LABEL: udot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: s_and_b32 s0, s0, s6 ; GFX10-DL-NEXT: s_and_b32 s1, s1, s6 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2465,72 +2465,72 @@ ; GFX9-NODL-LABEL: idot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2613,14 +2613,14 @@ ; GFX9-NODL-LABEL: udot2_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 @@ -2637,17 +2637,17 @@ ; GFX9-DL-LABEL: udot2_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2661,10 +2661,10 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2750,56 +2750,56 @@ ; GFX9-NODL-LABEL: notsdot2_sext8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NODL-NEXT: global_load_ushort v1, v[2:3], off -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off +; GFX9-NODL-NEXT: global_load_ushort v5, v[2:3], off +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v4 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 -; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v5 +; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v5, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v1, v0, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v1, v0, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notsdot2_sext8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-DL-NEXT: global_load_ushort v1, v[2:3], off -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off +; GFX9-DL-NEXT: global_load_ushort v5, v[2:3], off +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 -; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v3, 8, v5 +; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v1, v5, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v0, v1, v0, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v1, v0, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2807,29 +2807,29 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-DL-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-DL-NEXT: global_load_ushort v1, v[2:3], off -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_ushort v4, v[0:1], off +; GFX10-DL-NEXT: global_load_ushort v5, v[2:3], off +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, v0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, v5 +; GFX10-DL-NEXT: v_bfe_i32 v2, v4, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v3, v5, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, v3, v2, s2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, v1, v0, v2 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v1, v0, s0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i8> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -72,66 +72,66 @@ ; GFX9-NODL-LABEL: idot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -250,10 +250,10 @@ ; GFX9-NODL-LABEL: idot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -281,17 +281,17 @@ ; GFX9-DL-LABEL: idot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -305,10 +305,10 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -418,14 +418,14 @@ ; GFX9-NODL-LABEL: idot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 @@ -450,17 +450,17 @@ ; GFX9-DL-LABEL: idot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -474,10 +474,10 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -579,94 +579,94 @@ ; GFX9-NODL-LABEL: idot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-DL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_sext_i32_i8 s2, s0 -; GFX10-DL-NEXT: s_sext_i32_i8 s3, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80010 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x80010 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -778,64 +778,64 @@ ; GFX9-NODL-LABEL: idot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s6, s3, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s4, s2, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s6, s1, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s4, s0, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s0, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX9-DL-NEXT: s_ashr_i32 s6, s3, 24 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-DL-NEXT: s_ashr_i32 s4, s2, 24 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX9-DL-NEXT: s_ashr_i32 s6, s1, 24 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s1, s1 +; GFX9-DL-NEXT: s_ashr_i32 s4, s0, 24 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -843,30 +843,30 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s5, s3 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x80010 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -961,14 +961,14 @@ ; GFX9-NODL-LABEL: idot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v3, 8, s5 ; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s4 @@ -977,18 +977,18 @@ ; GFX9-NODL-NEXT: v_lshl_or_b32 v3, v3, 16, v5 ; GFX9-NODL-NEXT: v_and_b32_e32 v5, s4, v4 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v0, 8, s2 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v0, 8, s0 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s3, v4 -; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, s1, v4 +; GFX9-NODL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s2, v4 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v4 ; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u32_e32 v4, v3, v4 @@ -1001,14 +1001,14 @@ ; GFX9-DL-LABEL: idot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s5 ; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s4 @@ -1017,18 +1017,18 @@ ; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v5 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s4, v4 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s3 -; GFX9-DL-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v0, 8, s2 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s1 +; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v0, 8, s0 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s3, v4 -; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s1, v4 +; GFX9-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v4 ; GFX9-DL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v4, v3, v4 @@ -1049,28 +1049,28 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s0 -; GFX10-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v3 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s1 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v3 -; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 8, s2 +; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x80000 +; GFX10-DL-NEXT: v_and_b32_e32 v7, s1, v3 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s4 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s5 +; GFX10-DL-NEXT: v_and_b32_e32 v6, s2, v3 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 16 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x80000 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 8, s0 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x80000 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s0 +; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX10-DL-NEXT: v_and_b32_e32 v7, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s1, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v7 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s1 ; GFX10-DL-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v7 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -74,67 +74,67 @@ ; GFX9-NODL-LABEL: udot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v0, v1 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -245,14 +245,14 @@ ; GFX9-NODL-LABEL: udot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 ; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 @@ -277,17 +277,17 @@ ; GFX9-DL-LABEL: udot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -301,10 +301,10 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -415,14 +415,14 @@ ; GFX9-NODL-LABEL: udot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 @@ -447,17 +447,17 @@ ; GFX9-DL-LABEL: udot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -471,10 +471,10 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -562,14 +562,14 @@ ; GFX9-NODL-LABEL: udot2_8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 ; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 @@ -586,14 +586,14 @@ ; GFX9-DL-LABEL: udot2_8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_and_b32 s3, s2, s0 ; GFX9-DL-NEXT: s_and_b32 s0, s1, s0 @@ -617,17 +617,17 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s1, 0xff +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s0, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s2, s1 -; GFX10-DL-NEXT: s_and_b32 s1, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s1, s5, s0 +; GFX10-DL-NEXT: s_and_b32 s0, s4, s0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x80008 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -719,14 +719,14 @@ ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 ; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 @@ -751,17 +751,17 @@ ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s3, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s1, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -775,10 +775,10 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s1, s0, v2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s5, s4, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -882,14 +882,14 @@ ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 @@ -914,14 +914,14 @@ ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX9-DL-NEXT: s_and_b32 s3, s1, s0 @@ -946,7 +946,6 @@ ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s4, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 @@ -954,21 +953,22 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: s_and_b32 s2, s0, s4 -; GFX10-DL-NEXT: s_and_b32 s3, s1, s4 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 +; GFX10-DL-NEXT: s_and_b32 s0, s4, s2 +; GFX10-DL-NEXT: s_and_b32 s1, s5, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1074,97 +1074,97 @@ ; GFX9-NODL-LABEL: udot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v0, v1 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v0, v1 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x80010 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s4, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1280,100 +1280,100 @@ ; GFX9-NODL-LABEL: udot4_multiuse_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, s10, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v2, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v2, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v2, v0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s0, v2, v0 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-DL-NEXT: v_add_u32_e32 v1, s10, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v2, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v2, v0 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v2, v0 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s0, v2, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_movk_i32 s7, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s6, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 -; GFX10-DL-NEXT: s_and_b32 s2, s0, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s1, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v0 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: s_and_b32 s4, s0, s6 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s5, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s5, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s8, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1486,10 +1486,10 @@ ; GFX9-NODL-LABEL: notdot4_mixedtypes: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1517,10 +1517,10 @@ ; GFX9-DL-LABEL: notdot4_mixedtypes: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1555,21 +1555,21 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_sext_i32_i8 s2, s0 -; GFX10-DL-NEXT: s_sext_i32_i8 s3, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_sext_i32_i8 s0, s4 +; GFX10-DL-NEXT: s_sext_i32_i8 s1, s5 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1683,64 +1683,64 @@ ; GFX9-NODL-LABEL: udot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 24 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s4, 24 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s3 -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s4 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v2, v3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v2, v3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 24 -; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 24 -; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s3 -; GFX9-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s4 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX9-DL-NEXT: s_and_b32 s0, s0, s8 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v2, v3 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1748,32 +1748,32 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s6, 0xff -; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: s_and_b32 s4, s2, s6 -; GFX10-DL-NEXT: s_and_b32 s6, s3, s6 -; GFX10-DL-NEXT: v_and_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_and_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-DL-NEXT: s_and_b32 s6, s0, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s5 +; GFX10-DL-NEXT: v_and_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s6, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1869,30 +1869,30 @@ ; GFX9-NODL-LABEL: udot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v3, s6, 16, v3 ; GFX9-NODL-NEXT: v_lshl_or_b32 v4, s4, 16, v4 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v4, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 ; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v0, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u32_e32 v4, v2, v4 @@ -1905,30 +1905,30 @@ ; GFX9-DL-LABEL: udot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 24 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-DL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v3, s6, 16, v3 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, s4, 16, v4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v4, v3 -; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-DL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-DL-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 ; GFX9-DL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v0, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v4, v2, v4 @@ -1949,24 +1949,24 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s0 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s1 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s4 +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v3, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s5 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s0, s5, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 16 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24 +; GFX10-DL-NEXT: s_lshr_b32 s0, s5, 24 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, s1, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v3, s0, 16, v3 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, s0, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v3, s1, 16, v3 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 @@ -2090,29 +2090,29 @@ ; GFX9-NODL-LABEL: udot4_acc8_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s0, v0 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NODL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 ; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NODL-NEXT: v_or_b32_e32 v3, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v5, v[0:1], off ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) @@ -2126,29 +2126,29 @@ ; GFX9-DL-LABEL: udot4_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s0, v0 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-DL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 ; GFX9-DL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v5, v[0:1], off ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -2169,18 +2169,18 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 24 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s3 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s4 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s5 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s0, s1 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s4, s5 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 16 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v5 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -5,8 +5,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc32: @@ -106,84 +106,102 @@ ; GFX9-LABEL: idot8_acc32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 -; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX9-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc32: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v2, s0, s1, v0 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NOXNACK-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NOXNACK-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { entry: @@ -373,10 +391,10 @@ ; GFX9-LABEL: idot8_acc16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -424,10 +442,10 @@ ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -482,44 +500,93 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3 -; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12 +; GFX10-DL-NEXT: s_bfe_i32 s2, s4, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s3, s5, 0x40000 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s8, s5, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s0, s5, 0x40004 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s7, s8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s2, v2 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s0, v2 +; GFX10-DL-NEXT: s_mov_b32 s0, 0xffff +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40010 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 -; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_ashr_i32 s0, s4, 28 +; GFX10-DL-NEXT: s_ashr_i32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc16: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s2, s0, 12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s3, s1, 12 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s2 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v4, 12, s3 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s7, s0, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s8, s1, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e64 v5, s7, s8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v4, 12, v4 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, s6, s2, v2 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NOXNACK-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40014 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40014 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NOXNACK-NEXT: global_store_short v[0:1], v2, off +; GFX10-DL-NOXNACK-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i16 addrspace(1)* nocapture %dst) { entry: @@ -711,14 +778,14 @@ ; GFX9-LABEL: idot8_acc8: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s1, 12 ; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000 @@ -765,14 +832,14 @@ ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 12 ; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 @@ -826,44 +893,93 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3 -; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12 +; GFX10-DL-NEXT: s_bfe_i32 s2, s4, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s3, s5, 0x40000 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s8, s5, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s0, s5, 0x40004 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s7, s8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s2, v2 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s0, v2 +; GFX10-DL-NEXT: s_movk_i32 s0, 0xff +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40010 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 -; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_ashr_i32 s0, s4, 28 +; GFX10-DL-NEXT: s_ashr_i32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc8: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s2, s0, 12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s3, s1, 12 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s2 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v4, 12, s3 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s7, s0, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s8, s1, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e64 v5, s7, s8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v4, 12, v4 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, s6, s2, v2 +; GFX10-DL-NOXNACK-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NOXNACK-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40014 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40014 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NOXNACK-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NOXNACK-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { entry: @@ -1036,143 +1152,186 @@ ; GFX9-LABEL: idot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: v_mad_i32_i24 v1, s4, v0, v1 -; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: v_mad_i32_i24 v0, s6, v2, v0 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX9-NEXT: v_mad_i32_i24 v0, s8, v2, v0 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v2, s11 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_mad_i32_i24 v0, s10, v2, v0 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v0, s12, v2, v0 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v0, s14, v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: v_mad_i32_i24 v0, s16, v2, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mad_i32_i24 v0, s0, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 -; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v2, v0 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s0, v2, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40000 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v0 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v0 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40004 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x4000c +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40018 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40000 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v0, s2, s3, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v0 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40004 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40004 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40008 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x4000c +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x4000c +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40010 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40014 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40014 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_bfe_i32 s3, s1, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v2, v0, v1 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NOXNACK-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NOXNACK-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { entry: @@ -1342,31 +1501,31 @@ ; GFX9-LABEL: idot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 28 -; GFX9-NEXT: s_ashr_i32 s11, s3, 28 -; GFX9-NEXT: s_bfe_i32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x4000c -; GFX9-NEXT: s_bfe_i32 s16, s3, 0x40008 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s3, s3, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s2, s2, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_ashr_i32 s4, s0, 28 +; GFX9-NEXT: s_ashr_i32 s11, s1, 28 +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x4000c +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40008 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s1, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x4000c +; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_i32_i24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_i32_i24 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 @@ -1381,45 +1540,63 @@ ; GFX9-NEXT: v_mad_i32_i24 v0, s5, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v2, s0, s1, v0 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NOXNACK-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NOXNACK-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { entry: @@ -1566,26 +1743,27 @@ ; GFX9-LABEL: idot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_and_b32 s11, s2, 15 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s8 -; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_and_b32 s11, s0, 15 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s9, s10 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s5, s8 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s4 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 @@ -1594,27 +1772,27 @@ ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-NEXT: s_and_b32 s17, s6, 15 ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s17, s6 -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s15, s16 -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s13, s14 +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s17, s6 +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s15, s16 +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s13, s14 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v1, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 ; GFX9-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s12 -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s7, s12 +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 @@ -1633,26 +1811,27 @@ ; GFX9-DL-LABEL: idot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s8 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s11, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s10 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s8 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s1, s4 ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 @@ -1661,27 +1840,27 @@ ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s17, s6 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s15, s16 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s13, s14 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s17, s6 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s15, s16 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s13, s14 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v1, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v0, v4 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6 ; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s12 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s7, s12 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7 @@ -1707,52 +1886,52 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 28 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX10-DL-NEXT: s_and_b32 s8, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s9, s1, 15 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40004 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s8 -; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x4000c -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 28 +; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s3, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s8, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s4, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s9, s5, 15 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s8, s4 +; GFX10-DL-NEXT: s_bfe_u32 s8, s5, 0x40004 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s4 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s8 +; GFX10-DL-NEXT: s_bfe_u32 s9, s5, 0x4000c +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s4 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s4, s5, 0x40008 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s9 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s6, s5, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s5, 0x40014 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s6, s4 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s3 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s2, s3 +; GFX10-DL-NEXT: s_bfe_u32 s8, s5, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s2, s5, 28 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s8, s2 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] @@ -1765,6 +1944,75 @@ ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s3, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s4, s0, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX10-DL-NOXNACK-NEXT: s_and_b32 s8, s0, 15 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX10-DL-NOXNACK-NEXT: s_and_b32 s9, s1, 15 +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s8, s0 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s8, s1, 0x40004 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s9, s8 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s9, s1, 0x4000c +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s0, s1, 0x40008 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s6, s1, 0x40010 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s0, s1, 0x40014 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NOXNACK-NEXT: s_bfe_u32 s8, s1, 0x40018 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s0, s1, 28 +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s1, s2, s3 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_pack_ll_b32_b16 s0, s8, s0 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v3, v3, v5 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: global_store_short v[0:1], v2, off +; GFX10-DL-NOXNACK-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i16 addrspace(1)* nocapture %dst) { entry: @@ -1954,14 +2202,14 @@ ; GFX9-LABEL: idot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s1, 4 ; GFX9-NEXT: s_lshr_b32 s14, s2, 4 @@ -2042,14 +2290,14 @@ ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 4 ; GFX9-DL-NEXT: s_lshr_b32 s14, s2, 4 @@ -2137,24 +2385,24 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 4 -; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 4 +; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 4 +; GFX10-DL-NEXT: s_lshr_b32 s14, s5, 4 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s14 -; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX10-DL-NEXT: s_lshr_b32 s8, s4, 12 +; GFX10-DL-NEXT: s_lshr_b32 s15, s5, 12 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13 -; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 8 -; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 8 +; GFX10-DL-NEXT: s_lshr_b32 s9, s4, 8 +; GFX10-DL-NEXT: s_lshr_b32 s16, s5, 8 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 @@ -2167,31 +2415,31 @@ ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v19, v14 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 20 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s10, s1, 20 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 20 +; GFX10-DL-NEXT: s_lshr_b32 s2, s4, 16 +; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 24 +; GFX10-DL-NEXT: s_lshr_b32 s10, s5, 20 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s1 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s10 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v12 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 -; GFX10-DL-NEXT: s_lshr_b32 s11, s1, 16 -; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 28 +; GFX10-DL-NEXT: s_lshr_b32 s11, s5, 16 +; GFX10-DL-NEXT: s_lshr_b32 s12, s5, 28 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s11 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v11 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v13 -; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 24 +; GFX10-DL-NEXT: s_lshr_b32 s13, s5, 24 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v16 @@ -2210,7 +2458,7 @@ ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 @@ -2220,6 +2468,100 @@ ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm +; +; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul: +; GFX10-DL-NOXNACK: ; %bb.0: ; %entry +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s7, s0, 4 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s14, s1, 4 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v13, 12, s14 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s8, s0, 12 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s15, s1, 12 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v6, 12, s8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v13, 12, v13 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s9, s0, 8 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s16, s1, 8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v5, 12, s9 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v4, 12, v4 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v19, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v7, v7, v13 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v12, 12, v12 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v3, v3, v4 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v6, 8, v7 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v4, v19, v14 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s3, s0, 20 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s5, s0, 28 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s6, s0, 24 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s10, s1, 20 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v8, 12, s6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v10, 12, s4 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v11, 12, s3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v13, 12, s10 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v5, v5, v12 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v4, 8, v4 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s11, s1, 16 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s12, s1, 28 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v7, 12, s11 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v8, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v9, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v16, 12, s12 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v5, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v10, 12, v13 +; GFX10-DL-NOXNACK-NEXT: s_lshr_b32 s13, s1, 24 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v15, 12, s13 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v11, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v5, v5, v10 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v10, v9, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v12, 12, v15 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v8, v8, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 8, v5 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v5, v6, v12 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v6, 8, v8 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v2, v2, v9 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_e32 v4, v3, v5 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NOXNACK-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NOXNACK-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -104,11 +104,12 @@ ; GFX9-LABEL: udot8_acc32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 @@ -118,17 +119,17 @@ ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 @@ -142,44 +143,44 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -351,10 +352,10 @@ ; GFX9-LABEL: udot8_acc16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -399,10 +400,10 @@ ; GFX9-DL-LABEL: udot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -454,34 +455,34 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40008 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -654,10 +655,10 @@ ; GFX9-LABEL: udot8_acc8: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -702,10 +703,10 @@ ; GFX9-DL-LABEL: udot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -757,34 +758,34 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40008 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -961,10 +962,10 @@ ; GFX9-LABEL: udot8_acc4: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1012,10 +1013,10 @@ ; GFX9-DL-LABEL: udot8_acc4: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1070,36 +1071,36 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s5 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x4000c +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s1, s3 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s2, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1261,10 +1262,10 @@ ; GFX9-LABEL: udot8_CommutationInsideMAD: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1312,10 +1313,10 @@ ; GFX9-DL-LABEL: udot8_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1370,36 +1371,36 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40008 +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x4000c +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s0, s1 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s2, s3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1558,11 +1559,12 @@ ; GFX9-LABEL: udot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 @@ -1572,19 +1574,19 @@ ; GFX9-NEXT: s_bfe_u32 s15, s6, 0x4000c ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1 @@ -1597,21 +1599,22 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s1, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-DL-NEXT: s_lshr_b32 s7, s6, 28 @@ -1621,19 +1624,19 @@ ; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x4000c ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-DL-NEXT: s_and_b32 s6, s6, 15 -; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 +; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v2, v1 @@ -1646,34 +1649,34 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_and_b32 s4, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s5, s1, 15 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v0 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c @@ -1691,8 +1694,8 @@ ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1863,11 +1866,12 @@ ; GFX9-LABEL: udot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 @@ -1877,17 +1881,17 @@ ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 @@ -1901,44 +1905,44 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2085,45 +2089,46 @@ ; GFX9-LABEL: udot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 ; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, s3, v0 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s13, s14 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, s1, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s13, s14 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40014 ; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-NEXT: s_and_b32 s17, s6, 15 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s8 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s15, s16 ; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v0 -; GFX9-NEXT: s_and_b32 s11, s2, 15 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s11, s0, 15 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s17, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s17, s6 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, s4, v0 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v6, v5, v6 @@ -2140,45 +2145,46 @@ ; GFX9-DL-LABEL: udot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 ; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 ; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s3, v0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s13, s14 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s1, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s13, s14 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40014 ; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s5, s8 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x4000c ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s15, s16 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v0 -; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: s_and_b32 s11, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s10 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s17, s6 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s17, s6 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s4, v0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s0, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v6, v5, v6 @@ -2202,42 +2208,42 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40004 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s3 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s3, s4, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x40004 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x40008 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x4000c ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s2, s4 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40014 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x40014 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s3, s1 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s3, s5, 28 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s4 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x40018 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 @@ -2425,14 +2431,14 @@ ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s3, s1, 0x40010 ; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 @@ -2491,14 +2497,14 @@ ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 ; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 @@ -2564,45 +2570,45 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s3, s5 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s2, s3 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s5 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s1, s3 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x4000c +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s3 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s2, s1, 0x40008 -; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s4, s2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x40008 +; GFX10-DL-NEXT: s_mov_b32 s1, 0xffff +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s0 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s5, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s1, v3 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s6 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s6 +; GFX10-DL-NEXT: s_bfe_u32 s3, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s7, s5, 0x40010 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s7 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s4 +; GFX10-DL-NEXT: s_lshr_b32 s2, s5, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s7 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s4, s2 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40018 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, s5, s0 +; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x40018 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, s3, s0 ; GFX10-DL-NEXT: v_or_b32_e32 v5, v6, v5 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v5, s3, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -2754,10 +2760,10 @@ ; GFX9-LABEL: udot8_acc4_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2805,10 +2811,10 @@ ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2863,36 +2869,36 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s5 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x4000c +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s1, s3 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s2, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -3025,33 +3031,33 @@ ; GFX9-LABEL: udot8_variant1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 15 -; GFX9-NEXT: s_and_b32 s5, s3, 15 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_and_b32 s4, s0, 15 +; GFX9-NEXT: s_and_b32 s5, s1, 15 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40004 -; GFX9-NEXT: s_bfe_u32 s9, s3, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40018 -; GFX9-NEXT: s_lshr_b32 s3, s3, 28 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x40004 +; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40018 +; GFX9-NEXT: s_lshr_b32 s1, s1, 28 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3064,43 +3070,43 @@ ; GFX9-NEXT: v_mad_u32_u24 v0, s15, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: v_mad_u32_u24 v2, s17, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_variant1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; FIXME: Merge into imm.ll diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1,7 +1,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,IDXMODE-NO-XNACK,PREGFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,IDXMODE-XNACK,GFX9 %s ; Tests for indirect addressing on SI, which is implemented using dynamic ; indexing of vectors. @@ -273,7 +273,8 @@ ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} ; MOVREL: v_movreld_b32_e32 v0, 5 -; IDXMODE: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0xfffffe00{{$}} +; IDXMODE-NO-XNACK: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0xfffffe00{{$}} +; IDXMODE-XNACK: s_addk_i32 s{{[0-9]+}}, 0xfe00{{$}} ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(DST) ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; IDXMODE-NEXT: s_set_gpr_idx_off diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -53,7 +53,7 @@ ; GCN-NEXT: is_ptr64 = 1 ; GCN-NEXT: is_dynamic_callstack = 1 ; GCN-NEXT: is_debug_enabled = 0 -; GCN-NEXT: is_xnack_enabled = 1 +; GCN-NEXT: is_xnack_enabled = 0 ; GCN-NEXT: workitem_private_segment_byte_size = 16384 ; GCN-NEXT: workgroup_group_segment_byte_size = 0 ; GCN-NEXT: gds_segment_byte_size = 0 @@ -149,7 +149,7 @@ ; GCN-NEXT: is_ptr64 = 1 ; GCN-NEXT: is_dynamic_callstack = 1 ; GCN-NEXT: is_debug_enabled = 0 -; GCN-NEXT: is_xnack_enabled = 1 +; GCN-NEXT: is_xnack_enabled = 0 ; GCN-NEXT: workitem_private_segment_byte_size = 16384 ; GCN-NEXT: workgroup_group_segment_byte_size = 0 ; GCN-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -41,13 +41,13 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s0 +; GFX9-NEXT: s_pack_lh_b32_b16 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -93,14 +93,14 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: ;;#ASMSTART @@ -160,13 +160,13 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s0 +; GFX9-NEXT: s_pack_hh_b32_b16 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -214,11 +214,11 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: s_lshr_b32 s0, s6, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s1, s0, s2 @@ -280,12 +280,12 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s1, s0 @@ -394,13 +394,13 @@ ; GFX9-LABEL: s_insertelement_v2i16_1_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -572,12 +572,12 @@ ; GFX9-LABEL: v_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s4 +; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1049,16 +1049,16 @@ ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 ; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX9-NEXT: s_andn2_b32 s1, s2, s0 +; GFX9-NEXT: s_andn2_b32 s1, s4, s0 ; GFX9-NEXT: s_and_b32 s0, s0, 0x3e703e7 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 @@ -1113,12 +1113,12 @@ ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_lshl_b32 s2, s4, 4 +; GFX9-NEXT: s_lshl_b32 s2, s6, 4 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 @@ -1180,10 +1180,10 @@ ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1263,13 +1263,13 @@ ; GFX9-LABEL: v_insertelement_v4f16_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, v3, s4, v0 +; GFX9-NEXT: v_bfi_b32 v0, v3, s6, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1328,13 +1328,13 @@ ; GFX9-LABEL: v_insertelement_v4f16_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s4, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, s6, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1393,13 +1393,13 @@ ; GFX9-LABEL: v_insertelement_v4f16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v3, s4, v1 +; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1458,13 +1458,13 @@ ; GFX9-LABEL: v_insertelement_v4f16_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1523,13 +1523,13 @@ ; GFX9-LABEL: v_insertelement_v4i16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v3, s4, v1 +; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1590,7 +1590,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v2, v[0:1], off ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] @@ -1599,7 +1599,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1 ; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0 @@ -1678,17 +1678,17 @@ ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6 ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 4 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_lshl_b32 s4, s7, 4 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1 ; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { ; SI-LABEL: s_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s4, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: @@ -31,10 +31,10 @@ ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -47,13 +47,13 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 { ; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s2, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s4, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: @@ -70,10 +70,10 @@ ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -94,21 +94,21 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32: @@ -136,11 +136,11 @@ ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -268,21 +268,21 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: @@ -310,11 +310,11 @@ ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -335,21 +335,21 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: @@ -377,11 +377,11 @@ ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -402,21 +402,21 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: @@ -444,11 +444,11 @@ ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -470,21 +470,21 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: @@ -512,11 +512,11 @@ ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll @@ -32,8 +32,9 @@ } ; GCN-LABEL: {{^}}sample_contig_nsa: -; GCN: image_sample_c_l v0, v[0:7], -; NSA: image_sample v1, [v6, v7, v5], +; NONSA: image_sample_c_l v0, v[0:7], +; NSA: image_sample_c_l v8, v[0:7], +; NSA: image_sample v9, [v6, v7, v5], define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -44,8 +45,8 @@ } ; GCN-LABEL: {{^}}sample_nsa_nsa: -; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0], -; NSA: image_sample v1, [v6, v7, v5], +; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0], +; NSA: image_sample v9, [v6, v7, v5], define amdgpu_ps <2 x float> @sample_nsa_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %r2, float %s2, float %t2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -56,8 +57,8 @@ } ; GCN-LABEL: {{^}}sample_nsa_contig: -; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0], -; NSA: image_sample v1, v[5:7], +; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0], +; NSA: image_sample v9, v[5:7], define amdgpu_ps <2 x float> @sample_nsa_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %s2, float %t2, float %r2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -68,8 +69,9 @@ } ; GCN-LABEL: {{^}}sample_contig_contig: -; GCN: image_sample_c_l v0, v[0:7], -; NSA: image_sample v1, v[5:7], +; NSA: image_sample_c_l v8, v[0:7], +; NSA: image_sample v9, v[5:7], +; NONSA: image_sample_c_l v0, v[0:7], ; NONSA: image_sample v1, v[5:7], define amdgpu_ps <2 x float> @sample_contig_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %s2, float %t2, float %r2) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -8,7 +8,7 @@ ; VARIANT0-LABEL: test_barrier: ; VARIANT0: ; %bb.0: ; %entry ; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb +; VARIANT0-NEXT: s_load_dword s0, s[0:1], 0xb ; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT0-NEXT: s_mov_b32 s6, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -18,7 +18,7 @@ ; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VARIANT0-NEXT: s_barrier -; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s2, v3 +; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 @@ -29,7 +29,7 @@ ; VARIANT1-LABEL: test_barrier: ; VARIANT1: ; %bb.0: ; %entry ; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb +; VARIANT1-NEXT: s_load_dword s0, s[0:1], 0xb ; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT1-NEXT: s_mov_b32 s6, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -38,7 +38,7 @@ ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_barrier -; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s2, v3 +; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT1-NEXT: s_waitcnt expcnt(0) @@ -50,11 +50,11 @@ ; VARIANT2-LABEL: test_barrier: ; VARIANT2: ; %bb.0: ; %entry ; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c +; VARIANT2-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] -; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s0 +; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VARIANT2-NEXT: v_mov_b32_e32 v3, s3 @@ -70,11 +70,11 @@ ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry ; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c +; VARIANT3-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] -; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s0 +; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VARIANT3-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -5,14 +5,14 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_bfe_u32 v0, v0, s3, s3 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_bfe_u32 v0, v0, s5, s5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_arg_arg_arg: @@ -34,15 +34,15 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_arg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfe_u32 v0, s2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfe_u32 v0, s4, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_arg_arg_imm: @@ -65,15 +65,15 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_imm_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfe_u32 v0, s2, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfe_u32 v0, s4, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_arg_imm_arg: @@ -96,16 +96,16 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_imm_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s0, 0x7b +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_movk_i32 s6, 0x7b ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfe_u32 v0, s0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfe_u32 v0, s6, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_imm_arg_arg: @@ -1590,13 +1590,13 @@ ; SI-LABEL: lshr_and: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s2, 0x30006 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_and: @@ -1619,15 +1619,15 @@ define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; SI-LABEL: v_lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s0, s2, s3 -; SI-NEXT: s_and_b32 s0, s0, 7 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_lshr_b32 s2, s4, s5 +; SI-NEXT: s_and_b32 s4, s2, 7 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_and: @@ -1652,13 +1652,13 @@ ; SI-LABEL: and_lshr: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s2, 0x30006 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: and_lshr: @@ -1682,13 +1682,13 @@ ; SI-LABEL: and_lshr2: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s2, 0x30006 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: and_lshr2: @@ -1712,13 +1712,13 @@ ; SI-LABEL: shl_lshr: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s2, 0x150002 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bfe_u32 s4, s2, 0x150002 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_lshr: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -292,17 +292,17 @@ ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s6, s6 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -529,12 +529,12 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 @@ -643,12 +643,12 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -315,17 +315,17 @@ ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s6, s6 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -582,12 +582,12 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 @@ -696,12 +696,12 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -40,15 +40,15 @@ ; GCN-NEXT: v_mov_b32_e32 v1, 0x3000 ; GCN-NEXT: v_add_u32_e32 v1, 0x20d0, v1 ; GCN-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v5, vcc ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GCN-NEXT: s_endpgm @@ -93,15 +93,15 @@ ; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3 ; GCN-NEXT: v_add_u32_e32 v3, 0x20d0, v3 ; GCN-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:4 ; GCN-NEXT: s_sub_u32 s32, s32, 0x180000 ; GCN-NEXT: s_mov_b32 s33, s5 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5 +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc ; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -10,10 +10,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, s5, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -141,12 +141,12 @@ ; GFX9-LABEL: lshr_v_s_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -210,12 +210,12 @@ ; GFX9-LABEL: lshr_s_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s0 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -29,11 +29,11 @@ ; GFX9-LABEL: v_test_imax_sge_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -79,11 +79,11 @@ ; GFX9-LABEL: v_test_imax_sge_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -140,14 +140,17 @@ ; GFX9-LABEL: v_test_imax_sge_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_short_d16 v1, v0, s[0:1] offset:4 -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] +; GFX9-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v4, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_pk_max_i16 v1, v2, v1 @@ -200,11 +203,11 @@ ; GFX9-LABEL: v_test_imax_sge_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 @@ -249,11 +252,11 @@ ; GFX9-LABEL: v_test_imax_sgt_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -297,11 +300,11 @@ ; GFX9-LABEL: v_test_umax_uge_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -345,11 +348,11 @@ ; GFX9-LABEL: v_test_umax_ugt_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -394,11 +397,11 @@ ; GFX9-LABEL: v_test_umax_ugt_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -495,8 +495,8 @@ ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: ; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} ; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} -; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; GCN: s_min_u32 [[MIN:s[0-9]+]], s{{[0-9]}}, s{{[0-9]}} +; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], s{{[0-9]}} ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_UINT diff --git a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll --- a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll +++ b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}sample_contig_nsa: ; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], diff --git a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir --- a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: hazard_image_sample_d_buf_off6 # GCN: IMAGE_SAMPLE diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll --- a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll @@ -47,10 +47,10 @@ ; GCN-LABEL: reassoc_v2i32: ; GCN: s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_add_i32 [[ADD2:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} -; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}} +; GCN-DAG: s_add_i32 [[ADD2:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} +; GFX8-DAG: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}} ; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD2]], v{{[0-9]+}} -; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}} +; GFX9-DAG: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}} ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD2]], v{{[0-9]+}} define amdgpu_kernel void @reassoc_v2i32(<2 x i32> addrspace(1)* %arg, <2 x i32> %x, <2 x i32> %y) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -58,18 +58,18 @@ ; GFX9-LABEL: saddo_i64_zext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: s_add_u32 s2, s6, s0 -; GFX9-NEXT: s_addc_u32 s3, s7, s1 +; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[1:2] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc @@ -133,12 +133,12 @@ ; GFX9-LABEL: s_saddo_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_i32 v4, s0, v4 clamp -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: s_add_i32 s0, s2, s3 +; GFX9-NEXT: v_add_i32 v4, s2, v4 clamp ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v5, s0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v4 @@ -387,21 +387,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[8:9], off +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[4:5] +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: global_store_byte v[6:7], v0, off +; GFX9-NEXT: global_store_byte v[2:3], v0, off ; GFX9-NEXT: s_endpgm %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 @@ -486,23 +486,23 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_i32 v8, v0, v2 clamp -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_add_i32 v2, v1, v3 clamp -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_add_i32 v8, v4, v6 clamp +; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 +; GFX9-NEXT: v_add_i32 v6, v5, v7 clamp +; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v6 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -223,13 +223,13 @@ define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind { ; SI-LABEL: scalar_to_vector_test6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_to_vector_test6: diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll --- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll @@ -8,15 +8,15 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s8, s[2:3], 0x0 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, 1, s8 +; GCN-NEXT: s_lshr_b32 s0, 1, s2 ; GCN-NEXT: s_ff1_i32_b32 s0, s0 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], s8, 0 +; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[2:3] ; GCN-NEXT: v_ffbh_i32_e32 v1, v0 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll --- a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013 define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) { @@ -14,6 +15,7 @@ } ; GCN-LABEL: const_load_no_shrink_dword_to_aligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003 define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) { @@ -27,6 +29,7 @@ } ; GCN-LABEL: global_load_no_shrink_dword_to_unaligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013 define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %x) { @@ -40,6 +43,7 @@ } ; GCN-LABEL: global_load_no_shrink_dword_to_aligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003 define amdgpu_kernel void @global_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) { diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,12 +8,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -140,12 +140,12 @@ ; GFX9-LABEL: shl_v_s_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, s2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -209,12 +209,12 @@ ; GFX9-LABEL: shl_s_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -150,8 +150,8 @@ ; GCN-LABEL: {{^}}smrd_load_const1: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff glc ; encoding: [0xff -; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc ; -; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc glc ; +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc ; +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc glc ; define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -173,8 +173,8 @@ ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400 +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400 define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -194,8 +194,8 @@ ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -211,8 +211,8 @@ ; SMRD load with an offset greater than the largest possible immediate on VI ; GCN-LABEL: {{^}}smrd_load_const4: ; SIVIGFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 -; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] -; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]:[0-9]+}}], [[OFFSET]] +; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]:[0-9]+}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -137,7 +137,7 @@ ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; GCN: NumVgprs: 10 -; GFX900: ScratchSize: 44 +; GFX900: ScratchSize: 52 ; GFX908: ScratchSize: 20 ; GCN: VGPRBlocks: 2 ; GCN: NumVGPRsForWavesPerEU: 10 @@ -246,7 +246,7 @@ ; GFX908-DAG: v_accvgpr_read_b32 ; GCN: NumVgprs: 256 -; GFX900: ScratchSize: 2052 +; GFX900: ScratchSize: 1028 ; GFX908-FIXME: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir @@ -308,12 +308,12 @@ ; VR: renamable $sgpr8 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr12, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr9 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr13, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr14 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr15, 0, 0 :: (dereferenceable invariant load 4) - ; VR: renamable $sgpr15 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr16, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr10_sgpr11 = IMPLICIT_DEF + ; VR: renamable $sgpr17 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr22, 0, 0 :: (dereferenceable invariant load 4) + ; VR: renamable $sgpr15 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr16, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr12 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr18, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr13 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr19, 0, 0 :: (dereferenceable invariant load 4) ; VR: renamable $sgpr16 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr21, 0, 0 :: (dereferenceable invariant load 4) - ; VR: renamable $sgpr17 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr22, 0, 0 :: (dereferenceable invariant load 4) ; VR: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr10_sgpr11, implicit killed renamable $sgpr8, implicit killed renamable $sgpr9, implicit killed renamable $sgpr12, implicit killed renamable $sgpr13, implicit killed renamable $sgpr14, implicit killed renamable $sgpr15, implicit killed renamable $sgpr16, implicit killed renamable $sgpr17 %0:sgpr_128 = IMPLICIT_DEF %1:sreg_64 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -10,56 +10,61 @@ ; CHECK: bb.0..expVert: ; CHECK: liveins: $sgpr3, $sgpr4, $sgpr5, $sgpr8, $sgpr9, $sgpr10, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr25, $sgpr27, $sgpr31 ; CHECK: undef %56.sub0:sgpr_64 = COPY $sgpr31 - ; CHECK: SI_SPILL_S32_SAVE $sgpr27, %stack.2, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) - ; CHECK: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr25 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; CHECK: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr18 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr27 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr25 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr18 ; CHECK: undef %50.sub0:sgpr_64 = COPY $sgpr19 - ; CHECK: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr20 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr21 - ; CHECK: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr22 - ; CHECK: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr23 - ; CHECK: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr9 - ; CHECK: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr10 - ; CHECK: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr8 + ; CHECK: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr20 + ; CHECK: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr21 + ; CHECK: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr22 + ; CHECK: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr23 + ; CHECK: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr9 + ; CHECK: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr10 + ; CHECK: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK: undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0, 0 :: (load 8 from %ir.40, addrspace 4) ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; CHECK: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc - ; CHECK: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc - ; CHECK: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc ; CHECK: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc ; CHECK: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc - ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc - ; CHECK: SI_SPILL_S32_SAVE [[S_AND_B32_]], %stack.0, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (store 4 into %stack.0, addrspace 5) + ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc - ; CHECK: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY4]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0, 0 :: (load 16 from %ir.84, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0, 0 :: (load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %132:sgpr_128, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: early-clobber %73:sgpr_128, early-clobber %143:sgpr_128, early-clobber %131:sreg_32_xm0_xexec = BUNDLE %130, undef %132:sgpr_128, undef %74:sreg_64 { + ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0, 0 :: (load 16 from %ir.84, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0, 0 :: (load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %132:sgpr_128, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: } ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK: %71.sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK: %71.sub2:sgpr_128 = S_MOV_B32 -1 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: SI_SPILL_S128_SAVE %71, %stack.1, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5) + ; CHECK: early-clobber %87:vgpr_32, early-clobber %117:vgpr_32, early-clobber %76:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM1]], undef %118:sgpr_128, undef %89:sgpr_128, [[V_MOV_B32_e32_]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } + ; CHECK: [[COPY13:%[0-9]+]]:sgpr_128 = COPY %71 ; CHECK: %71.sub1:sgpr_128 = S_MOV_B32 0 ; CHECK: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc ; CHECK: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc - ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY4]], 64, implicit-def $scc + ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY5]], 64, implicit-def $scc ; CHECK: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %54:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: undef %149.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %149.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK: undef %156.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK: %156.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc ; CHECK: undef %163.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0, 0 :: (load 16 from %ir.91, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0, 0 :: (load 16 from %ir.97, addrspace 4) + ; CHECK: early-clobber %150:sgpr_128, early-clobber %157:sgpr_128 = BUNDLE %149, %156 { + ; CHECK: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0, 0 :: (load 16 from %ir.91, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0, 0 :: (load 16 from %ir.97, addrspace 4) + ; CHECK: } ; CHECK: %163.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %171:sreg_32, 31, implicit-def dead $scc ; CHECK: undef %176.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], undef %171:sreg_32, implicit-def $scc @@ -87,55 +92,67 @@ ; CHECK: %253.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: undef %261.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], undef %171:sreg_32, implicit-def $scc ; CHECK: %261.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %273.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK: undef %273.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %273.sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %286.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK: undef %286.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK: %286.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %293.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK: undef %293.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %293.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, undef %314:sreg_32, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 16, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %369:sgpr_128, undef %370:sreg_32, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0, 0 :: (load 16 from %ir.111, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0, 0 :: (load 16 from %ir.117, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0, 0 :: (load 16 from %ir.123, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0, 0 :: (load 16 from %ir.131, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0, 0 :: (load 16 from %ir.138, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: early-clobber %379:sreg_32_xm0_xexec, early-clobber %201:sgpr_128, early-clobber %177:sgpr_128, early-clobber %184:sgpr_128, early-clobber %319:sreg_32_xm0_xexec, early-clobber %191:sgpr_128, early-clobber %309:sreg_32_xm0_xexec, early-clobber %323:sreg_32_xm0_xexec, early-clobber %368:sreg_32_xm0_xexec, early-clobber %313:sreg_32_xm0_xexec, early-clobber %211:sgpr_128 = BUNDLE [[S_ADD_I32_]], %71, undef %369:sgpr_128, %210, undef %314:sreg_32, %200, undef %380:sgpr_128, %176, %183, [[S_ADD_I32_1]], %190, undef %370:sreg_32 { + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, undef %314:sreg_32, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 16, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %369:sgpr_128, undef %370:sreg_32, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0, 0 :: (load 16 from %ir.111, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0, 0 :: (load 16 from %ir.117, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0, 0 :: (load 16 from %ir.123, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0, 0 :: (load 16 from %ir.131, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0, 0 :: (load 16 from %ir.138, addrspace 4) + ; CHECK: } + ; CHECK: early-clobber %151:vgpr_32, early-clobber %158:vgpr_32, early-clobber %165:vgpr_32 = BUNDLE [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], [[S_LOAD_DWORDX4_IMM3]], [[S_LOAD_DWORDX4_IMM4]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } + ; CHECK: early-clobber %374:sreg_32_xm0_xexec, early-clobber %363:sreg_32_xm0_xexec = BUNDLE [[S_ADD_I32_]], undef %364:sgpr_128, undef %375:sgpr_128, [[S_ADD_I32_1]] { + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: } ; CHECK: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR]], -98, implicit-def dead $scc ; CHECK: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR1]], -114, implicit-def dead $scc ; CHECK: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR2]], -130, implicit-def dead $scc ; CHECK: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc - ; CHECK: undef %327.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK: undef %327.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %327.sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK: undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %335.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK: undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK: %343.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK: undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %351.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY9]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc ; CHECK: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %396:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0, 0 :: (load 16 from %ir.155, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0, 0 :: (load 16 from %ir.144, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0, 0 :: (load 16 from %ir.150, addrspace 4) + ; CHECK: early-clobber %218:sgpr_128, early-clobber %225:sgpr_128, early-clobber %231:sgpr_128 = BUNDLE %217, %224, %50 { + ; CHECK: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0, 0 :: (load 16 from %ir.155, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0, 0 :: (load 16 from %ir.144, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0, 0 :: (load 16 from %ir.150, addrspace 4) + ; CHECK: } ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0, 0 :: (load 16 from %ir.162, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0, 0 :: (load 16 from %ir.170, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: early-clobber %254:sgpr_128, early-clobber %242:sgpr_128 = BUNDLE %253, %241 { + ; CHECK: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0, 0 :: (load 16 from %ir.162, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0, 0 :: (load 16 from %ir.170, addrspace 4) + ; CHECK: } + ; CHECK: early-clobber %212:vgpr_32, early-clobber %202:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM8]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } ; CHECK: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -217, implicit-def dead $scc ; CHECK: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -233, implicit-def dead $scc ; CHECK: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR5]], -249, implicit-def dead $scc @@ -144,35 +161,41 @@ ; CHECK: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -329, implicit-def dead $scc ; CHECK: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -345, implicit-def dead $scc ; CHECK: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR6]], -441, implicit-def dead $scc - ; CHECK: [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 160, implicit-def $scc + ; CHECK: [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], 160, implicit-def $scc ; CHECK: [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %36:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: undef %411.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %411.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 4, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc ; CHECK: undef %425.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_4]], implicit-def $scc ; CHECK: %425.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ADD_U32_4:%[0-9]+]]:sreg_32 = S_ADD_U32 %56.sub0, 168, implicit-def $scc ; CHECK: [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %57:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc ; CHECK: undef %441.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_5]], implicit-def $scc ; CHECK: %441.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %441, 0, 0, 0 :: (load 4 from %ir..i085.i, align 8, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0, 0 :: (load 16 from %ir.176, addrspace 4) + ; CHECK: early-clobber %71.sub0:sgpr_128, early-clobber %262:sgpr_128 = BUNDLE %261, %441 { + ; CHECK: internal %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %441, 0, 0, 0 :: (load 4 from %ir..i085.i, align 8, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0, 0 :: (load 16 from %ir.176, addrspace 4) + ; CHECK: } ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0, 0 :: (load 16 from %ir.185, addrspace 4) ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0, 0 :: (load 16 from %ir.194, addrspace 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: early-clobber %445:sreg_32_xm0_xexec, early-clobber %287:sgpr_128 = BUNDLE %71, %286 { + ; CHECK: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0, 0 :: (load 16 from %ir.194, addrspace 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: } ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0, 0 :: (load 16 from %ir.200, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc + ; CHECK: early-clobber %281:vgpr_32, early-clobber %275:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM1]], [[S_LOAD_DWORDX4_IMM16]], [[V_MOV_B32_e32_]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } + ; CHECK: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc @@ -180,16 +203,20 @@ ; CHECK: %453.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %453, 0, 0, 0 :: (load 8 from %ir.304, addrspace 4) ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0, 0 :: (load 16 from %ir.223, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0, 0 :: (load 16 from %ir.230, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0, 0 :: (load 16 from %ir.236, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0, 0 :: (load 16 from %ir.242, addrspace 4) + ; CHECK: early-clobber %336:sgpr_128, early-clobber %352:sgpr_128, early-clobber %328:sgpr_128, early-clobber %344:sgpr_128 = BUNDLE %327, %343, %335, %351 { + ; CHECK: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0, 0 :: (load 16 from %ir.223, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0, 0 :: (load 16 from %ir.230, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0, 0 :: (load 16 from %ir.236, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0, 0 :: (load 16 from %ir.242, addrspace 4) + ; CHECK: } ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 3, implicit-def dead $scc + ; CHECK: early-clobber %329:vgpr_32, early-clobber %345:vgpr_32, early-clobber %337:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM20]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], [[S_LOAD_DWORDX4_IMM21]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } + ; CHECK: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc @@ -197,18 +224,22 @@ ; CHECK: %468.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %468, 0, 0, 0 :: (load 8 from %ir.316, addrspace 4) ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0, 0 :: (load 16 from %ir.278, addrspace 4) - ; CHECK: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0, 0 :: (load 4 from `i32 addrspace(4)* undef`, addrspace 4) - ; CHECK: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 3, implicit-def dead $scc + ; CHECK: early-clobber %412:sgpr_128, early-clobber %487:sreg_32_xm0_xexec, early-clobber %475:sreg_32_xm0_xexec = BUNDLE %71, undef %488:sreg_64, %411 { + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0, 0 :: (load 16 from %ir.278, addrspace 4) + ; CHECK: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0, 0 :: (load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; CHECK: } + ; CHECK: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 3, implicit-def dead $scc ; CHECK: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %425, 0, 0, 0 :: (load 16 from %ir.287, addrspace 4) ; CHECK: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc ; CHECK: undef %485.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_8]], implicit-def $scc ; CHECK: %485.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %485, 0, 0, 0 :: (load 4 from %ir..i0100.i, align 8, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: early-clobber %413:vgpr_32, early-clobber %427:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM23]], [[S_LOAD_DWORDX4_IMM24]], [[V_MOV_B32_e32_]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc @@ -217,8 +248,7 @@ ; CHECK: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -507, implicit-def dead $scc ; CHECK: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -539, implicit-def dead $scc ; CHECK: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc - ; CHECK: [[SI_SPILL_S32_RESTORE:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) - ; CHECK: [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[SI_SPILL_S32_RESTORE]], 96, implicit-def $scc + ; CHECK: [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 96, implicit-def $scc ; CHECK: [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %33:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: undef %514.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %514.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc @@ -229,9 +259,11 @@ ; CHECK: undef %530.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %530.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %530, 0, 0, 0 :: (load 16 from %ir.359, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: early-clobber %516:vgpr_32, early-clobber %532:vgpr_32, early-clobber %524:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM26]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], [[S_LOAD_DWORDX4_IMM27]], implicit $exec { + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: } ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], implicit $exec ; CHECK: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec @@ -336,13 +368,8 @@ ; CHECK: [[V_OR_B32_e32_61:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_60]], [[V_ADD_U32_e32_25]], implicit $exec ; CHECK: [[V_ADD_U32_e32_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_OR_B32_e32_62:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_61]], [[V_ADD_U32_e32_26]], implicit $exec - ; CHECK: [[SI_SPILL_S32_RESTORE1:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (load 4 from %stack.0, addrspace 5) - ; CHECK: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (load 16 from %stack.1, align 4, addrspace 5) - ; CHECK: undef %914.sub2_sub3:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub2_sub3 { - ; CHECK: internal %914.sub0:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub0 - ; CHECK: } - ; CHECK: %914.sub1:sgpr_128 = COPY [[SI_SPILL_S32_RESTORE1]] - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %914, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[COPY13]].sub1:sgpr_128 = COPY [[S_AND_B32_]] + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY13]], 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[V_ADD_U32_e32_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -576, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_OR_B32_e32_63:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_62]], [[V_ADD_U32_e32_27]], implicit $exec ; CHECK: [[V_ADD_U32_e32_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -577, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll b/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll --- a/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll +++ b/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll @@ -1,10 +1,8 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx902 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx904 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NO-ECC %s +; RUN: llc -march=amdgcn -mcpu=gfx904 -mattr=+sramecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+sramecc < %s | FileCheck -check-prefixes=GCN,ECC %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-sramecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s ; Make sure the correct set of targets are marked with ; FeatureDoesNotSupportSRAMECC, and +sram-ecc is ignored if it's never diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s + +; REQUIRES: asserts + +; NOT-SUPPORTED: sramecc setting for subtarget: Unsupported +; ANY: sramecc setting for subtarget: Any +define void @sramecc-subtarget-feature-default() #0 { + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s + +; REQUIRES: asserts + +; WARN: warning: sramecc 'Off' was requested for a processor that does not support it! +; OFF: sramecc setting for subtarget: Off + +define void @sramecc-subtarget-feature-disabled() #0 { + ret void +} + +attributes #0 = { "target-features"="-sramecc" } diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s + +; REQUIRES: asserts + +; WARN: warning: sramecc 'On' was requested for a processor that does not support it! +; ON: sramecc setting for subtarget: On +define void @sramecc-subtarget-feature-enabled() #0 { + ret void +} + +attributes #0 = { "target-features"="+sramecc" } diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; @@ -51,43 +51,43 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:12 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 ; GFX9-NEXT: ds_write_b8 v0, v2 offset:8 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s4, s3, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_lshr_b32 s0, s7, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s7, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:15 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s6, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s2, s1, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 @@ -205,20 +205,20 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: ds_write_b16 v0, v1 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm @@ -287,15 +287,15 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3 ; GFX9-NEXT: s_endpgm ; @@ -335,14 +335,14 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; @@ -380,14 +380,14 @@ define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -6,13 +6,13 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; @@ -49,34 +49,34 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 ; GFX9-NEXT: ds_write_b8 v0, v2 offset:4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s6, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s2, s1, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 @@ -172,17 +172,17 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: ds_write_b16 v0, v1 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm @@ -241,13 +241,13 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; GFX9-NEXT: ds_write_b32 v0, v3 offset:8 ; GFX9-NEXT: s_endpgm @@ -286,13 +286,13 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm @@ -331,13 +331,13 @@ define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -14,10 +14,10 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -123,11 +123,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -380,15 +380,15 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -432,19 +432,19 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v0 +; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -488,15 +488,15 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -542,15 +542,15 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v0 +; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -18,11 +18,12 @@ ; GFX9-LABEL: shuffle_v4f16_234u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -74,11 +75,12 @@ ; GFX9-LABEL: shuffle_v4f16_3u6u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -90,11 +92,12 @@ ; GFX9-LABEL: shuffle_v4f16_3uu7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -106,15 +109,15 @@ ; GFX9-LABEL: shuffle_v4f16_35u5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -126,14 +129,14 @@ ; GFX9-LABEL: shuffle_v4f16_357u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -173,9 +176,12 @@ ; GFX9-LABEL: shuffle_v4f16_0145: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -187,9 +193,12 @@ ; GFX9-LABEL: shuffle_v4f16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -229,9 +238,12 @@ ; GFX9-LABEL: shuffle_v4f16_2345: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -243,9 +255,12 @@ ; GFX9-LABEL: shuffle_v4f16_2367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -257,11 +272,12 @@ ; GFX9-LABEL: shuffle_v4f16_4501: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -273,11 +289,12 @@ ; GFX9-LABEL: shuffle_v4f16_4523: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -316,11 +333,12 @@ ; GFX9-LABEL: shuffle_v4f16_6701: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -332,11 +350,12 @@ ; GFX9-LABEL: shuffle_v4f16_6723: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -376,13 +395,14 @@ ; GFX9-LABEL: shuffle_v4f16_2356: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -394,13 +414,14 @@ ; GFX9-LABEL: shuffle_v4f16_5623: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -412,14 +433,15 @@ ; GFX9-LABEL: shuffle_v4f16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -431,15 +453,14 @@ ; GFX9-LABEL: shuffle_v4f16_5634: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -451,16 +472,16 @@ ; GFX9-LABEL: shuffle_v4f16_5734: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -472,13 +493,14 @@ ; GFX9-LABEL: shuffle_v4i16_2356: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -490,9 +512,12 @@ ; GFX9-LABEL: shuffle_v4i16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -556,12 +581,12 @@ ; GFX9-LABEL: shuffle_v4f16_6161: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -634,9 +659,12 @@ ; GFX9-LABEL: shuffle_v8f16_4589: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:8 -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -648,11 +676,12 @@ ; GFX9-LABEL: shuffle_v8f16_10_11_2_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -664,13 +693,14 @@ ; GFX9-LABEL: shuffle_v8f16_13_14_2_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -713,14 +743,16 @@ ; GFX9-LABEL: shuffle_v6f16_452367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: global_load_dword v3, v[3:4], off +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off +; GFX9-NEXT: global_load_dword v7, v[3:4], off ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0 %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1 @@ -732,18 +764,18 @@ ; GFX9-LABEL: fma_shuffle: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm entry: %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -778,15 +810,15 @@ ; GFX9-LABEL: shuffle_v4f16_0456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v7, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -10,21 +10,21 @@ ; GCN-NEXT: BB0_1: ; %bb0 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 -; GCN-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 +; GCN-NEXT: v_add_co_u32_e64 v6, vcc_lo, v0, 8 ; GCN-NEXT: s_mov_b32 s5, exec_lo -; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] +; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s8, v4 -; GCN-NEXT: v_readfirstlane_b32 s9, v5 -; GCN-NEXT: v_readfirstlane_b32 s10, v2 -; GCN-NEXT: v_readfirstlane_b32 s11, v3 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5] -; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GCN-NEXT: v_readfirstlane_b32 s8, v2 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3] +; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 ; GCN-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx902 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s + +; REQUIRES: asserts + +; NOT-SUPPORTED: xnack setting for subtarget: Unsupported +; ANY: xnack setting for subtarget: Any +define void @xnack-subtarget-feature-any() #0 { + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s + +; REQUIRES: asserts + +; WARN: warning: xnack 'Off' was requested for a processor that does not support it! +; OFF: xnack setting for subtarget: Off + +define void @xnack-subtarget-feature-disabled() #0 { + ret void +} + +attributes #0 = { "target-features"="-xnack" } diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s + +; REQUIRES: asserts + +; WARN: warning: xnack 'On' was requested for a processor that does not support it! +; ON: xnack setting for subtarget: On +define void @xnack-subtarget-feature-enabled() #0 { + ret void +} + +attributes #0 = { "target-features"="+xnack" } diff --git a/llvm/test/MC/AMDGPU/xnack-mask.s b/llvm/test/MC/AMDGPU/xnack-mask.s --- a/llvm/test/MC/AMDGPU/xnack-mask.s +++ b/llvm/test/MC/AMDGPU/xnack-mask.s @@ -1,10 +1,10 @@ // RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s // RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s // RUN: not llvm-mc -arch=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1001 %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1001 -mattr=-xnack %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney %s 2>&1 | FileCheck -check-prefix=XNACKERR --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -show-encoding %s | FileCheck -check-prefix=XNACK %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -mattr=+xnack %s 2>&1 | FileCheck -check-prefix=XNACKERR --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -mattr=+xnack -show-encoding %s | FileCheck -check-prefix=XNACK %s s_mov_b64 xnack_mask, -1 // NOSICIVI10: error: register not available on this GPU