diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -849,11 +849,12 @@ Ctx.diagnose(Diag); } - if (MFI->getLDSSize() > static_cast(STM.getLocalMemorySize())) { + if (MFI->getLDSSize() > + static_cast(STM.getAddressableLocalMemorySize())) { LLVMContext &Ctx = MF.getFunction().getContext(); - DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory", - MFI->getLDSSize(), - STM.getLocalMemorySize(), DS_Error); + DiagnosticInfoResourceLimit Diag( + MF.getFunction(), "local memory", MFI->getLDSSize(), + STM.getAddressableLocalMemorySize(), DS_Error); Ctx.diagnose(Diag); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -707,7 +707,7 @@ } } - LocalMemLimit = ST.getLocalMemorySize(); + LocalMemLimit = ST.getAddressableLocalMemorySize(); if (LocalMemLimit == 0) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -64,6 +64,7 @@ unsigned EUsPerCU = 4; unsigned MaxWavesPerEU = 10; unsigned LocalMemorySize = 0; + unsigned AddressableLocalMemorySize = 0; char WavefrontSizeLog2 = 0; public: @@ -210,6 +211,10 @@ return LocalMemorySize; } + unsigned getAddressableLocalMemorySize() const { + return AddressableLocalMemorySize; + } + // Number of SIMDs per "CU", where the "CU" is the unit onto which workgroups // are mapped. This takes WGP mode vs. CU mode into account. unsigned getEUsPerCU() const { return EUsPerCU; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -141,6 +141,12 @@ HasMovrel = true; } + AddressableLocalMemorySize = LocalMemorySize; + + if (AMDGPU::isGFX10Plus(*this) && + !getFeatureBits().test(AMDGPU::FeatureCuMode)) + LocalMemorySize *= 2; + // Don't crash on invalid devices. if (WavefrontSizeLog2 == 0) WavefrontSizeLog2 = 5; @@ -304,19 +310,29 @@ } } -unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, - const Function &F) const { - if (NWaves == 1) - return getLocalMemorySize(); - unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; - unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); - if (!WorkGroupsPerCu) - return 0; - unsigned MaxWaves = getMaxWavesPerEU(); - return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; +// Returns the maximum per-workgroup LDS allocation size (in bytes) that still +// allows the given function to achieve an occupancy of NWaves waves per +// SIMD / EU, taking into account only the function's *maximum* workgroup size. +unsigned +AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, + const Function &F) const { + const unsigned WaveSize = getWavefrontSize(); + const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; + const unsigned WavesPerWorkgroup = + std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize); + + const unsigned WorkGroupsPerCU = + std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup); + + return getLocalMemorySize() / WorkGroupsPerCU; } // FIXME: Should return min,max range. +// +// Returns the maximum occupancy, in number of waves per SIMD / EU, that can +// be achieved when only the given function is running on the machine; and +// taking into account the overall number of wave slots, the (maximum) workgroup +// size, and the per-workgroup LDS allocation size. unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &F) const { const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second; @@ -338,10 +354,13 @@ NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups); - // Round to the number of waves. - const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize; + // Round to the number of waves per CU. + const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize); unsigned MaxWaves = NumGroups * MaxGroupNumWaves; + // Number of waves per EU (SIMD). + MaxWaves = divideCeil(MaxWaves, getEUsPerCU()); + // Clamp to the maximum possible number of waves. MaxWaves = std::min(MaxWaves, getMaxWavesPerEU()); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -874,10 +874,12 @@ return; } + unsigned TargetOccupancy = + std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF)); unsigned WavesAfter = - std::min(S.getTargetOccupancy(), PressureAfter.getOccupancy(ST)); + std::min(TargetOccupancy, PressureAfter.getOccupancy(ST)); unsigned WavesBefore = - std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST)); + std::min(TargetOccupancy, PressureBefore.getOccupancy(ST)); LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore << ", after " << WavesAfter << ".\n"); diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp --- a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp +++ b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp @@ -28,7 +28,9 @@ InstrInfo(*this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)), - InstrItins(getInstrItineraryForCPU(GPU)) {} + InstrItins(getInstrItineraryForCPU(GPU)) { + AddressableLocalMemorySize = LocalMemorySize; +} R600Subtarget &R600Subtarget::initializeSubtargetDependencies(const Triple &TT, StringRef GPU, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12624,7 +12624,8 @@ // We can report everything over the maximum size as 0. We can't report // based on the actual size because we don't know if it's accurate or not // at any given point. - Known.Zero.setHighBits(countLeadingZeros(getSubtarget()->getLocalMemorySize())); + Known.Zero.setHighBits( + countLeadingZeros(getSubtarget()->getAddressableLocalMemorySize())); break; } } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -188,6 +188,10 @@ /// \returns Local memory size in bytes for given subtarget \p STI. unsigned getLocalMemorySize(const MCSubtargetInfo *STI); +/// \returns Maximum addressable local memory size in bytes for given subtarget +/// \p STI. +unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI); + /// \returns Number of execution units per compute unit for given subtarget \p /// STI. unsigned getEUsPerCU(const MCSubtargetInfo *STI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -804,11 +804,26 @@ } unsigned getLocalMemorySize(const MCSubtargetInfo *STI) { + unsigned BytesPerCU = 0; + if (STI->getFeatureBits().test(FeatureLocalMemorySize32768)) + BytesPerCU = 32768; + if (STI->getFeatureBits().test(FeatureLocalMemorySize65536)) + BytesPerCU = 65536; + + // "Per CU" really means "per whatever functional block the waves of a + // workgroup must share". So the effective local memory size is doubled in + // WGP mode on gfx10. + if (isGFX10Plus(*STI) && !STI->getFeatureBits().test(FeatureCuMode)) + BytesPerCU *= 2; + + return BytesPerCU; +} + +unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) { if (STI->getFeatureBits().test(FeatureLocalMemorySize32768)) return 32768; if (STI->getFeatureBits().test(FeatureLocalMemorySize65536)) return 65536; - return 0; } @@ -828,11 +843,18 @@ assert(FlatWorkGroupSize != 0); if (STI->getTargetTriple().getArch() != Triple::amdgcn) return 8; + unsigned MaxWaves = getMaxWavesPerEU(STI) * getEUsPerCU(STI); unsigned N = getWavesPerWorkGroup(STI, FlatWorkGroupSize); - if (N == 1) - return 40; - N = 40 / N; - return std::min(N, 16u); + if (N == 1) { + // Single-wave workgroups don't consume barrier resources. + return MaxWaves; + } + + unsigned MaxBarriers = 16; + if (isGFX10Plus(*STI) && !STI->getFeatureBits().test(FeatureCuMode)) + MaxBarriers = 32; + + return std::min(MaxWaves / N, MaxBarriers); } unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll @@ -492,29 +492,29 @@ ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v14, v[0:1] +; GFX8-NEXT: flat_load_ushort v16, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_add_u16_e32 v1, v6, v10 ; GFX8-NEXT: v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e32 v3, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v10, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v11, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v6, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v7, v8, v12 ; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 +; GFX8-NEXT: v_add_u16_e32 v10, v9, v13 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v13, v14, v0 +; GFX8-NEXT: v_add_u16_e32 v11, v16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v10 -; GFX8-NEXT: v_or_b32_e32 v2, v11, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v1, v3, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v10, v9 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: flat_store_short v[6:7], v13 +; GFX8-NEXT: flat_store_short v[14:15], v11 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -640,55 +640,55 @@ ; GFX8-LABEL: add_v11i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 18, v0 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v18, v[10:11] +; GFX8-NEXT: flat_load_ushort v19, v[12:13] +; GFX8-NEXT: flat_load_ushort v20, v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 18, v2 ; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v2 -; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_ushort v14, v[14:15] -; GFX8-NEXT: flat_load_ushort v15, v[16:17] -; GFX8-NEXT: flat_load_ushort v16, v[2:3] -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v1, v[14:15] +; GFX8-NEXT: flat_load_ushort v2, v[2:3] +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v4 +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u16_e32 v17, v6, v10 +; GFX8-NEXT: v_add_u16_e32 v3, v6, v10 ; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 18, v0 -; GFX8-NEXT: v_add_u16_e32 v18, v7, v11 +; GFX8-NEXT: v_add_u16_e32 v21, v7, v11 ; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0 -; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: flat_load_ushort v3, v[6:7] -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v21, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 +; GFX8-NEXT: v_add_u16_e32 v22, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 +; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v4 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GFX8-NEXT: v_add_u16_e32 v19, v8, v12 -; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 18, v4 -; GFX8-NEXT: v_add_u16_e32 v20, v9, v13 -; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v17, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v18, v11 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 20, v4 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v14, v2, v14 +; GFX8-NEXT: v_add_u16_e32 v13, v18, v0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v15, v3, v15 -; GFX8-NEXT: v_or_b32_e32 v2, v19, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v20, v13 +; GFX8-NEXT: v_add_u16_e32 v18, v19, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v16, v21, v16 +; GFX8-NEXT: v_add_u16_e32 v19, v20, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v21, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v22, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: flat_store_short v[6:7], v14 -; GFX8-NEXT: flat_store_short v[8:9], v15 -; GFX8-NEXT: flat_store_short v[10:11], v16 +; GFX8-NEXT: flat_store_short v[14:15], v13 +; GFX8-NEXT: flat_store_short v[16:17], v18 +; GFX8-NEXT: flat_store_short v[6:7], v19 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -771,34 +771,34 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v2, v6, v10 -; GFX8-NEXT: v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v10, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX8-NEXT: v_add_u16_e32 v16, v8, v12 -; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u16_e32 v0, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v2, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v6, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v8, v9, v13 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v10, v11 -; GFX8-NEXT: v_or_b32_e32 v2, v16, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v6, v14, v16 +; GFX8-NEXT: v_add_u16_sdwa v7, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v8, v15, v17 +; GFX8-NEXT: v_add_u16_sdwa v9, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v8, v6, v14 -; GFX8-NEXT: v_add_u16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v9, v7, v15 -; GFX8-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v9, v7 +; GFX8-NEXT: v_or_b32_e32 v7, v8, v9 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7] ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -164,8 +164,6 @@ ; GFX9-NEXT: v_add_u32_e32 v16, 1, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5] @@ -187,7 +185,6 @@ ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v16 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc @@ -200,78 +197,82 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v14, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v15, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v15, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[3:4] -; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 1, v2 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v16 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v10, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v11, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, v12, v8, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v2, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v15, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i128_vgpr_idx: @@ -286,7 +287,6 @@ ; GFX7-NEXT: v_add_i32_e32 v16, vcc, 1, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5] @@ -308,8 +308,6 @@ ; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 ; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:48 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v16 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc @@ -322,16 +320,19 @@ ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v3, v14, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, v15, s[4:5] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v14, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v15, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i128_vgpr_idx: @@ -339,37 +340,37 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 ; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v3 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v12, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v13, v9, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v14, v8, v10, s4 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_cndmask_b32_e32 v16, v12, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v18, v12, v14, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v13, v15, s4 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v9, v11, s4 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v3 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v15, v5, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v19, v5, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v12, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v17, v13, v7, vcc_lo -; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v3 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s4 @@ -383,8 +384,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -320,9 +320,9 @@ ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_mov_b64 s[12:13], 5 ; GCN-NEXT: v_mov_b32_e32 v7, s10 ; GCN-NEXT: v_mov_b32_e32 v8, s11 -; GCN-NEXT: s_mov_b64 s[12:13], 5 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll @@ -733,33 +733,33 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] -; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v17 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_rcp_f64_e32 v[18:19], v[12:13] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[10:11] +; GFX6-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 +; GFX6-NEXT: v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17] +; GFX6-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] +; GFX6-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 -; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] -; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] -; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_mul_f64 v[16:17], v[20:21], v[8:9] +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v21 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: s_nop 1 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] +; GFX6-NEXT: s_nop 0 +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -768,26 +768,26 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX8-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] +; GFX8-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] +; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] +; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] +; GFX8-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] ; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -796,26 +796,26 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX9-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] +; GFX9-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] +; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] +; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] +; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] ; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -970,33 +970,33 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] -; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v17 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_rcp_f64_e32 v[18:19], v[12:13] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[10:11] +; GFX6-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 +; GFX6-NEXT: v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17] +; GFX6-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] +; GFX6-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 -; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] -; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] -; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_mul_f64 v[16:17], v[20:21], v[8:9] +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v21 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: s_nop 1 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] +; GFX6-NEXT: s_nop 0 +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1005,26 +1005,26 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX8-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] +; GFX8-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] +; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] +; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] +; GFX8-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] ; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,26 +1033,26 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX9-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] +; GFX9-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] +; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] +; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] +; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] ; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1133,7 +1133,7 @@ ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 +; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 @@ -1142,23 +1142,23 @@ ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v9 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v20, v9 ; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v17 +; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v20, v17 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1295,7 +1295,7 @@ ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 +; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 @@ -1304,23 +1304,23 @@ ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v9 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v20, v9 ; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v17 +; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v20, v17 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1533,7 +1533,7 @@ ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 +; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 @@ -1542,23 +1542,23 @@ ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v9 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v20, v9 ; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v17 +; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v20, v17 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1769,33 +1769,33 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] -; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v17 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_rcp_f64_e32 v[18:19], v[12:13] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[10:11] +; GFX6-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 +; GFX6-NEXT: v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17] +; GFX6-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] +; GFX6-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 -; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] -; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] -; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_mul_f64 v[16:17], v[20:21], v[8:9] +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v21 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: s_nop 1 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] +; GFX6-NEXT: s_nop 0 +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1804,26 +1804,26 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX8-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] +; GFX8-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] +; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] +; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] +; GFX8-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] ; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1832,26 +1832,26 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX9-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] +; GFX9-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] +; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] +; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] +; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] ; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -7899,90 +7899,90 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[8:9], 1 ; GFX6-NEXT: v_xor_b32_e32 v16, -1, v16 -; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX6-NEXT: v_lshlrev_b32_e32 v17, 31, v10 ; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v17 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 -; GFX6-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 -; GFX6-NEXT: v_lshr_b64 v[18:19], v[8:9], v24 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX6-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v23 -; GFX6-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v23 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v23 +; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v23 +; GFX6-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 +; GFX6-NEXT: v_lshl_b64 v[18:19], v[2:3], v23 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v23 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v25 +; GFX6-NEXT: v_or_b32_e32 v16, v16, v18 +; GFX6-NEXT: v_or_b32_e32 v17, v17, v19 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX6-NEXT: v_subrev_i32_e64 v0, s[4:5], 64, v24 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v0 -; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 +; GFX6-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc +; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v24 +; GFX6-NEXT: v_subrev_i32_e32 v23, vcc, 64, v24 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v24 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v10 +; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v24 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v23 +; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v0, v25, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX6-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v20 -; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v8 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX6-NEXT: v_or_b32_e32 v1, v19, v1 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v18 +; GFX6-NEXT: v_subrev_i32_e32 v20, vcc, 64, v18 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16 -; GFX6-NEXT: v_subrev_i32_e32 v18, vcc, 64, v16 -; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v16 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v18 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX6-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[4:5], v18 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v20 +; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX6-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc ; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v17 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v17 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v19 +; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v19 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v19 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 -; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, 64, v17 -; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v17 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v12 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v19 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v14 +; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v4, v18, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v19, v5 -; GFX6-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc +; GFX6-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 +; GFX6-NEXT: v_or_b32_e32 v6, v18, v6 ; GFX6-NEXT: v_or_b32_e32 v7, v20, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -7990,90 +7990,90 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX8-NEXT: v_xor_b32_e32 v16, -1, v16 -; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 31, v10 ; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX8-NEXT: v_or_b32_e32 v9, v9, v17 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 -; GFX8-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX8-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v23 -; GFX8-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v23 +; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v23 +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v16, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[18:19], v23, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] +; GFX8-NEXT: v_or_b32_e32 v16, v16, v18 +; GFX8-NEXT: v_or_b32_e32 v17, v17, v19 ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX8-NEXT: v_subrev_u32_e64 v0, s[4:5], 64, v24 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v24 +; GFX8-NEXT: v_subrev_u32_e32 v23, vcc, 64, v24 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v24, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v23, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, v25, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX8-NEXT: v_xor_b32_e32 v8, -1, v20 -; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX8-NEXT: v_or_b32_e32 v1, v19, v1 +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v18 +; GFX8-NEXT: v_subrev_u32_e32 v20, vcc, 64, v18 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] -; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, 64, v16 -; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX8-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v18, v[4:5] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v20, v[4:5] +; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v17 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5] +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v19 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v19 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v19, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, 64, v17 -; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v12, v[6:7] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GFX8-NEXT: v_lshrrev_b64 v[12:13], v19, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v18, v4 -; GFX8-NEXT: v_or_b32_e32 v5, v19, v5 -; GFX8-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc +; GFX8-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 +; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v18, v6 ; GFX8-NEXT: v_or_b32_e32 v7, v20, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8081,90 +8081,90 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 -; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_xor_b32_e32 v16, -1, v16 -; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 31, v10 ; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_or_b32_e32 v9, v9, v17 -; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] +; GFX9-NEXT: v_sub_u32_e32 v16, 64, v23 +; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v23 +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v16, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[18:19], v23, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] +; GFX9-NEXT: v_or_b32_e32 v16, v16, v18 +; GFX9-NEXT: v_or_b32_e32 v17, v17, v19 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX9-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v23 -; GFX9-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24 +; GFX9-NEXT: v_subrev_u32_e32 v23, 64, v24 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v24, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v23, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v0, v25, v2 -; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX9-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v20 -; GFX9-NEXT: v_and_b32_e32 v17, 0x7f, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16 +; GFX9-NEXT: v_or_b32_e32 v1, v19, v1 +; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18 +; GFX9-NEXT: v_subrev_u32_e32 v20, 64, v18 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v18, 64, v16 -; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v18, v[4:5] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v20, v[4:5] +; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 31, v14 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v17 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5] +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v19 +; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v19, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v12, 64, v17 -; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v12, v[6:7] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v19, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v4, v18, v4 -; GFX9-NEXT: v_or_b32_e32 v5, v19, v5 -; GFX9-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc +; GFX9-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v18, v6 ; GFX9-NEXT: v_or_b32_e32 v7, v20, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -8041,274 +8041,274 @@ ; GFX6-LABEL: v_fshr_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_xor_b32_e32 v17, -1, v16 +; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 +; GFX6-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v17 -; GFX6-NEXT: v_lshrrev_b32_e32 v17, 31, v1 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 ; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 -; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX6-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 -; GFX6-NEXT: v_lshr_b64 v[18:19], v[8:9], v24 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX6-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v23 -; GFX6-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v23 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX6-NEXT: v_subrev_i32_e64 v0, s[4:5], 64, v24 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v0 -; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v24 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[16:17], v0 +; GFX6-NEXT: v_lshl_b64 v[18:19], v[2:3], v24 +; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v24 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[16:17], v24 +; GFX6-NEXT: v_or_b32_e32 v18, v0, v18 +; GFX6-NEXT: v_or_b32_e32 v19, v1, v19 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[16:17], v25 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v0, v25, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v17, v8 +; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v23 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v23 +; GFX6-NEXT: v_lshl_b64 v[2:3], v[10:11], v2 +; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23 +; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 +; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v23 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v20 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; GFX6-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc ; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v17 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v19 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v17 -; GFX6-NEXT: v_subrev_i32_e32 v18, vcc, 64, v17 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v19 +; GFX6-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX6-NEXT: v_subrev_i32_e32 v20, vcc, 64, v19 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v19 ; GFX6-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX6-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v17 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v18 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v16 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v16 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v20 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v18 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v18 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], v6 -; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, 64, v16 -; GFX6-NEXT: v_or_b32_e32 v11, v4, v6 -; GFX6-NEXT: v_or_b32_e32 v17, v5, v7 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v10 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v16 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc -; GFX6-NEXT: v_or_b32_e32 v4, v18, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v19, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v6 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v7 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v19 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[14:15], v18 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX6-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 +; GFX6-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX6-NEXT: v_or_b32_e32 v7, v11, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v17, -1, v16 +; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 +; GFX8-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 31, v1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] ; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 -; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX8-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX8-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v23 -; GFX8-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX8-NEXT: v_subrev_u32_e64 v0, s[4:5], 64, v24 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] +; GFX8-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v24 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[16:17] +; GFX8-NEXT: v_lshlrev_b64 v[18:19], v24, v[2:3] +; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v24 +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v24, v[16:17] +; GFX8-NEXT: v_or_b32_e32 v18, v0, v18 +; GFX8-NEXT: v_or_b32_e32 v19, v1, v19 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[16:17] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, v25, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v17, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v23 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v23, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] +; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX8-NEXT: v_xor_b32_e32 v8, -1, v20 ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc ; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v17 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v19 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7] -; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, 64, v17 +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v19, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX8-NEXT: v_subrev_u32_e32 v20, vcc, 64, v19 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[8:9] ; GFX8-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX8-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v17, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], v18, v[8:9] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v16 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v16, v[12:13] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v20, v[8:9] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v18 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v18, v[12:13] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, 64, v16 -; GFX8-NEXT: v_or_b32_e32 v11, v4, v6 -; GFX8-NEXT: v_or_b32_e32 v17, v5, v7 -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v10, v[14:15] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v16, v[14:15] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v18, v6 -; GFX8-NEXT: v_or_b32_e32 v5, v19, v7 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 +; GFX8-NEXT: v_or_b32_e32 v6, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v5, v7 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v19, v[14:15] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v18, v[14:15] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 +; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v11, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v17, -1, v16 +; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 +; GFX9-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 31, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 -; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] ; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v24 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[16:17] +; GFX9-NEXT: v_lshlrev_b64 v[18:19], v24, v[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v24 +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v24, v[16:17] +; GFX9-NEXT: v_or_b32_e32 v18, v0, v18 +; GFX9-NEXT: v_or_b32_e32 v19, v1, v19 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[16:17] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v23, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] +; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23 +; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX9-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v23 -; GFX9-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v0, v25, v2 -; GFX9-NEXT: v_or_b32_e32 v2, v17, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v20 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX9-NEXT: v_and_b32_e32 v17, 0x7f, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, 64, v17 +; GFX9-NEXT: v_sub_u32_e32 v4, 64, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v18, 64, v17 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v19, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX9-NEXT: v_subrev_u32_e32 v20, 64, v19 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v17, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[8:9], v18, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, 64, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v16, v[12:13] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v20, v[8:9] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc +; GFX9-NEXT: v_sub_u32_e32 v6, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v18, v[12:13] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX9-NEXT: v_subrev_u32_e32 v10, 64, v16 -; GFX9-NEXT: v_or_b32_e32 v11, v4, v6 -; GFX9-NEXT: v_or_b32_e32 v17, v5, v7 -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v10, v[14:15] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v16, v[14:15] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc -; GFX9-NEXT: v_or_b32_e32 v4, v18, v6 -; GFX9-NEXT: v_or_b32_e32 v5, v19, v7 -; GFX9-NEXT: v_or_b32_e32 v6, v8, v10 -; GFX9-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 +; GFX9-NEXT: v_or_b32_e32 v6, v4, v6 +; GFX9-NEXT: v_or_b32_e32 v7, v5, v7 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v19, v[14:15] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v18, v[14:15] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX9-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX9-NEXT: v_or_b32_e32 v7, v11, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_v2i128: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -904,23 +904,23 @@ ; GFX10-NEXT: v_mov_b32_e32 v18, s19 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 3, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 2, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 4, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 2, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 3, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 5, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 6, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v0, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v1, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v1, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v1, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v0, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v0, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v1, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v1, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v1, s7 ; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v0, s8 ; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll @@ -7,111 +7,107 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) { ; LOOP-LABEL: memcpy_p1i8: ; LOOP: ; %bb.0: -; LOOP-NEXT: s_mov_b32 s2, 0 -; LOOP-NEXT: s_mov_b32 s3, 0xf000 -; LOOP-NEXT: s_mov_b64 s[0:1], 0 +; LOOP-NEXT: s_mov_b32 s6, 0 +; LOOP-NEXT: s_mov_b32 s7, 0xf000 +; LOOP-NEXT: s_mov_b64 s[4:5], 0 ; LOOP-NEXT: v_mov_b32_e32 v5, v3 ; LOOP-NEXT: v_mov_b32_e32 v4, v2 ; LOOP-NEXT: v_mov_b32_e32 v7, v1 ; LOOP-NEXT: v_mov_b32_e32 v6, v0 -; LOOP-NEXT: v_mov_b32_e32 v8, s2 +; LOOP-NEXT: v_mov_b32_e32 v8, s6 ; LOOP-NEXT: .LBB0_1: ; %load-store-loop ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 -; LOOP-NEXT: s_waitcnt expcnt(1) -; LOOP-NEXT: buffer_load_ubyte v9, v[4:5], s[0:3], 0 addr64 +; LOOP-NEXT: buffer_load_ubyte v9, v[4:5], s[4:7], 0 addr64 +; LOOP-NEXT: s_waitcnt expcnt(6) +; LOOP-NEXT: buffer_load_ubyte v10, v[4:5], s[4:7], 0 addr64 offset:1 +; LOOP-NEXT: s_waitcnt expcnt(3) +; LOOP-NEXT: buffer_load_ubyte v11, v[4:5], s[4:7], 0 addr64 offset:2 ; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: buffer_load_ubyte v10, v[4:5], s[0:3], 0 addr64 offset:1 -; LOOP-NEXT: s_waitcnt vmcnt(0) +; LOOP-NEXT: buffer_load_ubyte v12, v[4:5], s[4:7], 0 addr64 offset:3 +; LOOP-NEXT: buffer_load_ubyte v13, v[4:5], s[4:7], 0 addr64 offset:4 +; LOOP-NEXT: buffer_load_ubyte v14, v[4:5], s[4:7], 0 addr64 offset:5 +; LOOP-NEXT: buffer_load_ubyte v15, v[4:5], s[4:7], 0 addr64 offset:6 +; LOOP-NEXT: buffer_load_ubyte v16, v[4:5], s[4:7], 0 addr64 offset:7 +; LOOP-NEXT: buffer_load_ubyte v17, v[4:5], s[4:7], 0 addr64 offset:8 +; LOOP-NEXT: buffer_load_ubyte v18, v[4:5], s[4:7], 0 addr64 offset:9 +; LOOP-NEXT: buffer_load_ubyte v19, v[4:5], s[4:7], 0 addr64 offset:10 +; LOOP-NEXT: buffer_load_ubyte v20, v[4:5], s[4:7], 0 addr64 offset:11 +; LOOP-NEXT: buffer_load_ubyte v21, v[4:5], s[4:7], 0 addr64 offset:12 +; LOOP-NEXT: buffer_load_ubyte v22, v[4:5], s[4:7], 0 addr64 offset:13 +; LOOP-NEXT: buffer_load_ubyte v23, v[4:5], s[4:7], 0 addr64 offset:14 +; LOOP-NEXT: buffer_load_ubyte v24, v[4:5], s[4:7], 0 addr64 offset:15 +; LOOP-NEXT: v_add_i32_e32 v8, vcc, 1, v8 +; LOOP-NEXT: s_xor_b64 s[0:1], vcc, -1 +; LOOP-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; LOOP-NEXT: s_and_b64 vcc, s[0:1], exec +; LOOP-NEXT: s_waitcnt vmcnt(14) ; LOOP-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; LOOP-NEXT: v_or_b32_e32 v9, v10, v9 -; LOOP-NEXT: buffer_load_ubyte v10, v[4:5], s[0:3], 0 addr64 offset:2 -; LOOP-NEXT: buffer_load_ubyte v11, v[4:5], s[0:3], 0 addr64 offset:3 -; LOOP-NEXT: s_waitcnt vmcnt(0) -; LOOP-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; LOOP-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; LOOP-NEXT: v_or_b32_e32 v10, v11, v10 -; LOOP-NEXT: v_or_b32_e32 v9, v10, v9 -; LOOP-NEXT: buffer_load_ubyte v10, v[4:5], s[0:3], 0 addr64 offset:4 -; LOOP-NEXT: buffer_load_ubyte v11, v[4:5], s[0:3], 0 addr64 offset:5 -; LOOP-NEXT: s_waitcnt vmcnt(0) -; LOOP-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; LOOP-NEXT: v_or_b32_e32 v10, v11, v10 -; LOOP-NEXT: buffer_load_ubyte v11, v[4:5], s[0:3], 0 addr64 offset:6 -; LOOP-NEXT: buffer_load_ubyte v12, v[4:5], s[0:3], 0 addr64 offset:7 -; LOOP-NEXT: s_waitcnt vmcnt(0) +; LOOP-NEXT: s_waitcnt vmcnt(12) ; LOOP-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; LOOP-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; LOOP-NEXT: v_or_b32_e32 v11, v12, v11 -; LOOP-NEXT: v_or_b32_e32 v10, v11, v10 -; LOOP-NEXT: buffer_load_ubyte v11, v[4:5], s[0:3], 0 addr64 offset:8 -; LOOP-NEXT: buffer_load_ubyte v12, v[4:5], s[0:3], 0 addr64 offset:9 -; LOOP-NEXT: s_waitcnt vmcnt(0) -; LOOP-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; LOOP-NEXT: v_or_b32_e32 v11, v12, v11 -; LOOP-NEXT: buffer_load_ubyte v12, v[4:5], s[0:3], 0 addr64 offset:10 -; LOOP-NEXT: buffer_load_ubyte v13, v[4:5], s[0:3], 0 addr64 offset:11 -; LOOP-NEXT: s_waitcnt vmcnt(0) -; LOOP-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; LOOP-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; LOOP-NEXT: v_or_b32_e32 v12, v13, v12 -; LOOP-NEXT: v_or_b32_e32 v11, v12, v11 -; LOOP-NEXT: buffer_load_ubyte v12, v[4:5], s[0:3], 0 addr64 offset:12 -; LOOP-NEXT: buffer_load_ubyte v13, v[4:5], s[0:3], 0 addr64 offset:13 -; LOOP-NEXT: s_waitcnt vmcnt(0) -; LOOP-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; LOOP-NEXT: v_or_b32_e32 v12, v13, v12 -; LOOP-NEXT: buffer_load_ubyte v13, v[4:5], s[0:3], 0 addr64 offset:14 -; LOOP-NEXT: buffer_load_ubyte v14, v[4:5], s[0:3], 0 addr64 offset:15 +; LOOP-NEXT: s_waitcnt vmcnt(10) +; LOOP-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; LOOP-NEXT: s_waitcnt vmcnt(8) +; LOOP-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; LOOP-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; LOOP-NEXT: s_waitcnt vmcnt(6) +; LOOP-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; LOOP-NEXT: s_waitcnt vmcnt(4) +; LOOP-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; LOOP-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; LOOP-NEXT: s_waitcnt vmcnt(2) +; LOOP-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; LOOP-NEXT: s_waitcnt vmcnt(0) -; LOOP-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; LOOP-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; LOOP-NEXT: v_or_b32_e32 v13, v14, v13 -; LOOP-NEXT: v_or_b32_e32 v12, v13, v12 +; LOOP-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; LOOP-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; LOOP-NEXT: v_or_b32_e32 v9, v10, v9 +; LOOP-NEXT: v_or_b32_e32 v10, v12, v11 +; LOOP-NEXT: v_or_b32_e32 v11, v14, v13 +; LOOP-NEXT: v_or_b32_e32 v12, v16, v15 +; LOOP-NEXT: v_or_b32_e32 v13, v18, v17 +; LOOP-NEXT: v_or_b32_e32 v14, v20, v19 +; LOOP-NEXT: v_or_b32_e32 v15, v22, v21 +; LOOP-NEXT: v_or_b32_e32 v16, v24, v23 +; LOOP-NEXT: v_or_b32_e32 v9, v10, v9 +; LOOP-NEXT: v_or_b32_e32 v10, v12, v11 +; LOOP-NEXT: v_or_b32_e32 v11, v14, v13 +; LOOP-NEXT: v_or_b32_e32 v12, v16, v15 ; LOOP-NEXT: v_lshrrev_b32_e32 v13, 16, v9 ; LOOP-NEXT: v_bfe_u32 v14, v9, 8, 8 -; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 -; LOOP-NEXT: buffer_store_byte v14, v[6:7], s[0:3], 0 addr64 offset:1 -; LOOP-NEXT: s_waitcnt expcnt(1) +; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[4:7], 0 addr64 +; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v9, 24, v9 -; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:2 -; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:3 +; LOOP-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; LOOP-NEXT: v_bfe_u32 v16, v10, 8, 8 +; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[4:7], 0 addr64 offset:4 ; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_lshrrev_b32_e32 v9, 16, v10 -; LOOP-NEXT: v_bfe_u32 v13, v10, 8, 8 -; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:4 -; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:5 -; LOOP-NEXT: s_waitcnt expcnt(1) ; LOOP-NEXT: v_lshrrev_b32_e32 v10, 24, v10 -; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:6 -; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:7 -; LOOP-NEXT: s_waitcnt expcnt(1) -; LOOP-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_bfe_u32 v10, v11, 8, 8 -; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:8 -; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:9 +; LOOP-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; LOOP-NEXT: v_bfe_u32 v18, v11, 8, 8 +; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:8 ; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_lshrrev_b32_e32 v10, 24, v11 -; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:10 -; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:11 -; LOOP-NEXT: s_waitcnt expcnt(1) -; LOOP-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; LOOP-NEXT: v_lshrrev_b32_e32 v11, 24, v11 +; LOOP-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; LOOP-NEXT: v_bfe_u32 v20, v12, 8, 8 +; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:12 ; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_bfe_u32 v10, v12, 8, 8 -; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:12 -; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:13 -; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_lshrrev_b32_e32 v10, 24, v12 -; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:14 -; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:15 -; LOOP-NEXT: v_add_i32_e32 v8, vcc, 1, v8 -; LOOP-NEXT: s_xor_b64 s[4:5], vcc, -1 -; LOOP-NEXT: v_add_i32_e32 v6, vcc, 16, v6 -; LOOP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; LOOP-NEXT: v_add_i32_e32 v4, vcc, 16, v4 -; LOOP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; LOOP-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; LOOP-NEXT: s_and_b64 vcc, exec, s[4:5] +; LOOP-NEXT: v_lshrrev_b32_e32 v12, 24, v12 +; LOOP-NEXT: buffer_store_byte v14, v[6:7], s[4:7], 0 addr64 offset:1 +; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[4:7], 0 addr64 offset:2 +; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[4:7], 0 addr64 offset:3 +; LOOP-NEXT: buffer_store_byte v16, v[6:7], s[4:7], 0 addr64 offset:5 +; LOOP-NEXT: buffer_store_byte v15, v[6:7], s[4:7], 0 addr64 offset:6 +; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[4:7], 0 addr64 offset:7 +; LOOP-NEXT: buffer_store_byte v18, v[6:7], s[4:7], 0 addr64 offset:9 +; LOOP-NEXT: buffer_store_byte v17, v[6:7], s[4:7], 0 addr64 offset:10 +; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:11 +; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[4:7], 0 addr64 offset:13 +; LOOP-NEXT: buffer_store_byte v19, v[6:7], s[4:7], 0 addr64 offset:14 +; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:15 +; LOOP-NEXT: v_add_i32_e64 v6, s[0:1], 16, v6 +; LOOP-NEXT: v_addc_u32_e64 v7, s[0:1], 0, v7, s[0:1] +; LOOP-NEXT: v_add_i32_e64 v4, s[0:1], 16, v4 +; LOOP-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v5, s[0:1] ; LOOP-NEXT: s_cbranch_vccnz .LBB0_1 ; LOOP-NEXT: ; %bb.2: ; %memcpy-split ; LOOP-NEXT: s_mov_b32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -1645,208 +1645,208 @@ ; GFX7-LABEL: v_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11 -; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] -; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc -; GFX7-NEXT: v_mov_b32_e32 v20, v18 -; GFX7-NEXT: v_mov_b32_e32 v18, v19 -; GFX7-NEXT: v_mov_b32_e32 v19, v16 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] -; GFX7-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v19, v22 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] -; GFX7-NEXT: v_mul_lo_u32 v24, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] -; GFX7-NEXT: v_mul_lo_u32 v22, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] -; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] -; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] -; GFX7-NEXT: v_mov_b32_e32 v20, v11 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX7-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13] -; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13] -; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-NEXT: v_mov_b32_e32 v16, v0 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 +; GFX7-NEXT: v_mov_b32_e32 v17, v1 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] +; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] +; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] +; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; GFX7-NEXT: v_mov_b32_e32 v18, v23 +; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] +; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, v20 +; GFX7-NEXT: v_mov_b32_e32 v1, v23 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] +; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9 +; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] +; GFX7-NEXT: v_mul_lo_u32 v23, v5, v10 +; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 +; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] +; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] +; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13 +; GFX7-NEXT: v_mov_b32_e32 v2, v22 +; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] +; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] +; GFX7-NEXT: v_mul_lo_u32 v12, v3, v12 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9] +; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] +; GFX7-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9] +; GFX7-NEXT: v_mul_lo_u32 v10, v16, v15 +; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14 +; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11 -; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] -; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc -; GFX8-NEXT: v_mov_b32_e32 v20, v18 -; GFX8-NEXT: v_mov_b32_e32 v18, v19 -; GFX8-NEXT: v_mov_b32_e32 v19, v16 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] -; GFX8-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v19, v22 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] -; GFX8-NEXT: v_mul_lo_u32 v24, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] -; GFX8-NEXT: v_mul_lo_u32 v22, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] -; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] -; GFX8-NEXT: v_mov_b32_e32 v20, v11 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX8-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13] -; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13] -; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-NEXT: v_mov_b32_e32 v16, v0 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 +; GFX8-NEXT: v_mov_b32_e32 v17, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] +; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] +; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; GFX8-NEXT: v_mov_b32_e32 v18, v23 +; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] +; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, v20 +; GFX8-NEXT: v_mov_b32_e32 v1, v23 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] +; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] +; GFX8-NEXT: v_mul_lo_u32 v23, v5, v10 +; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 +; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] +; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] +; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v2, v22 +; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] +; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] +; GFX8-NEXT: v_mul_lo_u32 v12, v3, v12 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9] +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] +; GFX8-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9] +; GFX8-NEXT: v_mul_lo_u32 v10, v16, v15 +; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14 +; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11 -; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v20, vcc -; GFX9-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v16 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] -; GFX9-NEXT: v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] -; GFX9-NEXT: v_mov_b32_e32 v19, v22 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] -; GFX9-NEXT: v_mul_lo_u32 v24, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] -; GFX9-NEXT: v_mul_lo_u32 v22, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] -; GFX9-NEXT: v_mov_b32_e32 v20, v11 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[12:13], 0, v2, s[12:13] -; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v12, v3, s[12:13] -; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v26, v4, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v11, v5, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v25, v6, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v17, v0, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v22, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v24, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 +; GFX9-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_mov_b32_e32 v18, v23 +; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v23 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] +; GFX9-NEXT: v_mul_lo_u32 v20, v6, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] +; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10 +; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] +; GFX9-NEXT: v_mul_lo_u32 v13, v2, v13 +; GFX9-NEXT: v_mov_b32_e32 v2, v22 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9] +; GFX9-NEXT: v_mul_lo_u32 v12, v3, v12 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v18, s[8:9], 0, v6, s[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[8:9], v9, v3, s[8:9] +; GFX9-NEXT: v_mul_lo_u32 v10, v16, v15 +; GFX9-NEXT: v_mul_lo_u32 v9, v17, v14 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[8:9], v25, v4, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[8:9], v18, v5, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], v21, v6, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[8:9], v24, v10, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v10, v9, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v13, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v12, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v26, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v9, v23, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -425,24 +425,24 @@ ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v0, v9 +; GISEL-NEXT: v_xor_b32_e32 v12, v0, v9 ; GISEL-NEXT: v_mul_lo_u32 v0, v15, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v16, v11 ; GISEL-NEXT: v_xor_b32_e32 v14, v1, v9 ; GISEL-NEXT: v_mul_hi_u32 v1, v16, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; GISEL-NEXT: v_mul_hi_u32 v11, v15, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -451,189 +451,189 @@ ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v12, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v10 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v12, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v16, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v15, v[10:11] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v15, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v13, v[10:11] +; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v14, v10, vcc +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v14, v10 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v1, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v10 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v10, vcc ; GISEL-NEXT: v_xor_b32_e32 v7, v1, v10 -; GISEL-NEXT: v_xor_b32_e32 v12, v6, v10 +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v12 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v14, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v15, vcc +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v14 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v14, v11, vcc -; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v0, v4, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v11, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, 0, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v18, v14, 0 -; GISEL-NEXT: v_subb_u32_e32 v19, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v5 -; GISEL-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v6, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v18, v11, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v11, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v20, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v19, v14, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v20, v21, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v20, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v20, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v17, v17, v21, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_lo_u32 v13, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v20, v13 +; GISEL-NEXT: v_trunc_f32_e32 v18, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v7 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18 +; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v6, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v12, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v14, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v18, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v19, v[4:5] +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v19, v4 +; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v18, v4 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_mul_hi_u32 v4, v18, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v13, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v11, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v6, v8 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v13, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v4 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v18, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v13, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v13, v15, v14, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v12, v[4:5] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v5, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v2, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v12, v4 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v4 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v16, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 ; GISEL-NEXT: v_mul_lo_u32 v4, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v2 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v11, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v12, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GISEL-NEXT: v_mul_hi_u32 v4, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v9, v14, v8 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v9, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v11, v8, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v12 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v8, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v12 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 @@ -2610,89 +2610,89 @@ ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v11, v9 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v7, v15, v[1:2] ; GISEL-NEXT: v_lshl_b64 v[11:12], s[4:5], v6 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v12 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v0 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v14, v[9:10] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v16, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v16, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v6, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v13, v9, v10, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v6 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v12, v6, vcc ; GISEL-NEXT: v_xor_b32_e32 v11, v1, v6 ; GISEL-NEXT: v_xor_b32_e32 v12, v10, v6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v11 ; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v12 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v9 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v14 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v15, vcc ; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v9, vcc -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v10, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v10 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v0 -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, 0, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v21, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v18, v16, 0 -; GISEL-NEXT: v_subb_u32_e32 v19, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v7 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v18, v21, v[1:2] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v7 -; GISEL-NEXT: v_mul_lo_u32 v1, v21, v0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v19, v16, v[9:10] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v20, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v9 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v0, v21, v0 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] +; GISEL-NEXT: v_trunc_f32_e32 v18, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v20, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v15, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v20, vcc +; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v11 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18 +; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v10, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v20, v18, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v21, v19, v[9:10] +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v19, v9 +; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v18, v9 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_lo_u32 v10, v21, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v21, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v19, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 +; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v21, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v9, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v10, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v7, v8 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v13, v15, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v18, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v9, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v10, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v13, v8 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v5, v2, v13 ; GISEL-NEXT: v_mul_lo_u32 v2, v10, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v9, v4 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v7 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 ; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v10, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 @@ -2716,32 +2716,32 @@ ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 ; GISEL-NEXT: v_mul_lo_u32 v4, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v5, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_xor_b32_e32 v9, v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GISEL-NEXT: v_mul_hi_u32 v4, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v13, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v8 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v10, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v9, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v9, v[3:4] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v8, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 @@ -2750,27 +2750,27 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v11 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v12 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v8, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v7, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v13, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -3301,90 +3301,90 @@ ; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v11 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v4, vcc ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v4, vcc ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v11, v4 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v10, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v12, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[4:5] +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, 0 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v10, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v15, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v9 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v17, v14, v[5:6] +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v11, v[0:1] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v14, v[2:3] -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v7, v1 -; GISEL-NEXT: v_mul_lo_u32 v1, v11, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v4 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc ; GISEL-NEXT: v_mul_hi_u32 v0, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v11, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v2, v11, v2 +; GISEL-NEXT: v_mul_lo_u32 v1, v15, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v15, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GISEL-NEXT: v_mul_hi_u32 v2, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v15, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v4, 0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v17 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v5, v[1:2] -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v15, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v3, 0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v18 ; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v4, v[1:2] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v17, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v11, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v5, v0 -; GISEL-NEXT: v_mul_lo_u32 v3, v4, v1 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v19, vcc +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v3, v[1:2] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v19, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v2, v4, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v0 +; GISEL-NEXT: v_mul_hi_u32 v13, v3, v0 ; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, v5, v1 -; GISEL-NEXT: v_mul_hi_u32 v0, v5, v0 -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v4, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v4, v1 +; GISEL-NEXT: v_mul_hi_u32 v0, v4, v0 +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v3, v1 ; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v13, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v13, v3 -; GISEL-NEXT: v_mul_hi_u32 v1, v5, v1 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v1, v4, v1 ; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v3, v2 +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v7, v2 ; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v2 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v4, v0 -; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v5, v1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v3, v0 +; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v4, v1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v3, v9, v1 ; GISEL-NEXT: v_mul_hi_u32 v4, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc ; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc @@ -3406,9 +3406,9 @@ ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, v[0:1] -; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v6 +; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v5 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v7, v[3:4] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v6, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v11, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1286,17 +1286,17 @@ ; GFX8-NEXT: s_add_u32 s6, s12, s16 ; GFX8-NEXT: s_mov_b32 s17, s16 ; GFX8-NEXT: s_addc_u32 s7, s13, s16 -; GFX8-NEXT: s_xor_b64 s[6:7], s[6:7], s[16:17] -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX8-NEXT: s_xor_b64 s[8:9], s[6:7], s[16:17] +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX8-NEXT: s_mov_b32 s3, s2 -; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] +; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_u32 s12, 0, s6 -; GFX8-NEXT: s_subb_u32 s13, 0, s7 -; GFX8-NEXT: s_xor_b64 s[16:17], s[2:3], s[16:17] +; GFX8-NEXT: s_sub_u32 s6, 0, s8 +; GFX8-NEXT: s_subb_u32 s7, 0, s9 +; GFX8-NEXT: s_xor_b64 s[18:19], s[2:3], s[16:17] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 @@ -1304,10 +1304,12 @@ ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v4, v[1:2] +; GFX8-NEXT: s_ashr_i32 s16, s15, 31 +; GFX8-NEXT: s_mov_b32 s17, s16 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2] ; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v3, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 @@ -1330,15 +1332,14 @@ ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v4, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2] ; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: s_ashr_i32 s12, s15, 31 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v3, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: s_mov_b32 s13, s12 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 @@ -1358,64 +1359,64 @@ ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1 +; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX8-NEXT: v_mul_hi_u32 v5, s13, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, s9, v1 +; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v4, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s9 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_ashr_i32 s8, s11, 31 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v6, s13 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s12, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_ashr_i32 s12, s11, 31 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v6 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s13, v1 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s6, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s10, s8 -; GFX8-NEXT: s_addc_u32 s1, s11, s8 -; GFX8-NEXT: s_add_u32 s10, s14, s12 -; GFX8-NEXT: s_addc_u32 s11, s15, s12 -; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX8-NEXT: s_add_u32 s0, s10, s12 +; GFX8-NEXT: s_addc_u32 s1, s11, s12 +; GFX8-NEXT: s_add_u32 s10, s14, s16 +; GFX8-NEXT: s_addc_u32 s11, s15, s16 +; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[16:17] ; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s11 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s10 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s6, v8 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 @@ -1429,19 +1430,19 @@ ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v12 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v0 -; GFX8-NEXT: s_mov_b32 s9, s8 -; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] +; GFX8-NEXT: s_mov_b32 s13, s12 +; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] ; GFX8-NEXT: s_sub_u32 s3, 0, s10 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v13, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v12 -; GFX8-NEXT: s_subb_u32 s18, 0, s11 +; GFX8-NEXT: s_subb_u32 s20, 0, s11 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v3, v10, vcc ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v5, v[1:2] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v15, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], s18, v13, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], s20, v13, v[1:2] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v9, v16, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 @@ -1469,13 +1470,13 @@ ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v13, v0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v8, 0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4 +; GFX8-NEXT: v_xor_b32_e32 v1, s18, v4 ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v5, v[0:1] -; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10 -; GFX8-NEXT: v_mov_b32_e32 v10, s17 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s18, v8, v[3:4] -; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 +; GFX8-NEXT: v_xor_b32_e32 v9, s19, v10 +; GFX8-NEXT: v_mov_b32_e32 v10, s19 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s18, v1 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc ; GFX8-NEXT: v_xor_b32_e32 v4, s2, v7 ; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2 @@ -1503,37 +1504,37 @@ ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v10, s2 -; GFX8-NEXT: v_mul_lo_u32 v7, s7, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s6, v3 +; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s2, v4 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s6, v2 +; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s7, v3 -; GFX8-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3 +; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s6, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s7, v3 +; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s10, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s7 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, s9 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s11, v8, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s7, v6 +; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 @@ -1543,40 +1544,39 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s10, v2 ; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v8 +; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v9, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s10, v11 -; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v12 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s10, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13 +; GFX8-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1] +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v2, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[8:9], s[12:13] -; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6 -; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17] +; GFX8-NEXT: v_xor_b32_e32 v2, s0, v8 +; GFX8-NEXT: v_xor_b32_e32 v3, s1, v9 +; GFX8-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s8, v9 -; GFX8-NEXT: v_xor_b32_e32 v7, s8, v7 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s8, v6 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc +; GFX8-NEXT: v_xor_b32_e32 v6, s12, v6 +; GFX8-NEXT: v_xor_b32_e32 v7, s12, v7 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s12, v6 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, s5 @@ -1599,17 +1599,17 @@ ; GFX9-NEXT: s_add_u32 s6, s12, s16 ; GFX9-NEXT: s_mov_b32 s17, s16 ; GFX9-NEXT: s_addc_u32 s7, s13, s16 -; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[16:17] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_xor_b64 s[8:9], s[6:7], s[16:17] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] +; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3] ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s12, 0, s6 -; GFX9-NEXT: s_subb_u32 s13, 0, s7 -; GFX9-NEXT: s_xor_b64 s[16:17], s[2:3], s[16:17] +; GFX9-NEXT: s_sub_u32 s6, 0, s8 +; GFX9-NEXT: s_subb_u32 s7, 0, s9 +; GFX9-NEXT: s_xor_b64 s[18:19], s[2:3], s[16:17] ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1617,10 +1617,12 @@ ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v4, v[1:2] +; GFX9-NEXT: s_ashr_i32 s16, s15, 31 +; GFX9-NEXT: s_mov_b32 s17, s16 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v3, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 @@ -1642,15 +1644,15 @@ ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v4, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: s_ashr_i32 s12, s15, 31 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v3, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: s_mov_b32 s13, s12 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 @@ -1669,226 +1671,225 @@ ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s8, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: s_ashr_i32 s8, s11, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_add3_u32 v4, v3, v0, v6 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s12, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[2:3] +; GFX9-NEXT: s_ashr_i32 s12, s11, 31 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v1, s13, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v1 -; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s10, s8 -; GFX9-NEXT: s_addc_u32 s1, s11, s8 -; GFX9-NEXT: s_add_u32 s10, s14, s12 -; GFX9-NEXT: s_addc_u32 s11, s15, s12 -; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] -; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s11 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 -; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s6, v8 -; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v0, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v12, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v12 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: s_add_u32 s0, s10, s12 +; GFX9-NEXT: s_addc_u32 s1, s11, s12 +; GFX9-NEXT: s_add_u32 s10, s14, s16 +; GFX9-NEXT: s_addc_u32 s11, s15, s16 +; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s11 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s10 +; GFX9-NEXT: v_subrev_co_u32_e32 v16, vcc, s8, v9 +; GFX9-NEXT: v_subbrev_co_u32_e32 v17, vcc, 0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v15 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GFX9-NEXT: v_trunc_f32_e32 v13, v2 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v13 +; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GFX9-NEXT: s_mov_b32 s13, s12 +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] ; GFX9-NEXT: s_sub_u32 s3, 0, s10 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v13, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v14, 0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_subb_u32 s14, 0, s11 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v12, v[1:2] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v10, vcc -; GFX9-NEXT: v_mul_hi_u32 v10, v13, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v13, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v12, v0 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v13, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v11, vcc +; GFX9-NEXT: v_mul_hi_u32 v11, v14, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v14, v[2:3] ; GFX9-NEXT: v_mul_lo_u32 v3, v13, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v15, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v12 +; GFX9-NEXT: v_mul_lo_u32 v4, v14, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0 -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v10, v12, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v13, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v12, v1 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], v10, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 +; GFX9-NEXT: v_mul_lo_u32 v11, v13, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v14, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v13, v2 +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v11, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v3 +; GFX9-NEXT: v_add_u32_e32 v4, v11, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], v13, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], v12, v1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v10, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX9-NEXT: v_xor_b32_e32 v8, s16, v4 -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v11, v[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX9-NEXT: v_xor_b32_e32 v5, s17, v5 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v10, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v9, vcc -; GFX9-NEXT: v_xor_b32_e32 v4, s2, v7 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 +; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2 +; GFX9-NEXT: v_add_co_u32_e64 v11, s[0:1], v14, v1 +; GFX9-NEXT: v_addc_co_u32_e64 v12, s[0:1], v13, v2, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v11, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; GFX9-NEXT: v_xor_b32_e32 v9, s18, v5 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v12, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX9-NEXT: v_xor_b32_e32 v7, s19, v7 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s14, v11, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v10, s19 +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s18, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v10, vcc +; GFX9-NEXT: v_xor_b32_e32 v5, s2, v8 +; GFX9-NEXT: v_mul_lo_u32 v7, v12, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, v11, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v11, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, v12, v3 ; GFX9-NEXT: v_xor_b32_e32 v6, s2, v6 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, v12, v4 ; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s7, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s6, v3 -; GFX9-NEXT: v_mul_hi_u32 v9, s6, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s7, v2 -; GFX9-NEXT: v_mul_hi_u32 v12, s7, v3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s7, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, s6, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 +; GFX9-NEXT: v_mul_hi_u32 v8, v11, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v9, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v10, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s2, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 -; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s10, v8, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v2 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s11, v10, v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 -; GFX9-NEXT: v_sub_u32_e32 v6, s7, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s10, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s10, v11 -; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], s[12:13] -; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6 -; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, s8, v9 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_xor_b32_e32 v7, s8, v7 -; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s8, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc +; GFX9-NEXT: v_add3_u32 v4, v8, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s9, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, s8, v4 +; GFX9-NEXT: v_mul_hi_u32 v10, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s9, v3 +; GFX9-NEXT: v_mul_hi_u32 v12, s9, v4 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, s9, v4 +; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v8, s8, v4 +; GFX9-NEXT: v_mov_b32_e32 v9, s2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v3, v7 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s10, v11, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s2, v5 +; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc +; GFX9-NEXT: v_add3_u32 v9, v8, v7, v12 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s10, v9, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v10, s9 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s8, v3 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s11, v11, v[7:8] +; GFX9-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 +; GFX9-NEXT: v_sub_u32_e32 v7, s9, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s10, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v13 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s10, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 +; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v18, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v10, v11, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v3, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[0:1] +; GFX9-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17] +; GFX9-NEXT: v_xor_b32_e32 v3, s0, v10 +; GFX9-NEXT: v_xor_b32_e32 v4, s1, v9 +; GFX9-NEXT: v_mov_b32_e32 v9, s1 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v9, vcc +; GFX9-NEXT: v_xor_b32_e32 v7, s12, v7 +; GFX9-NEXT: v_xor_b32_e32 v8, s12, v8 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s12, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v9, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[4:5] -; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -447,6 +447,7 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v11, v1 ; GISEL-NEXT: v_mul_hi_u32 v12, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -459,136 +460,134 @@ ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v1, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v6, v12 -; GISEL-NEXT: v_xor_b32_e32 v12, v7, v12 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v13 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v12 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 -; GISEL-NEXT: v_mac_f32_e32 v15, 0x4f800000, v16 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v10, v[1:2] -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v15 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v13 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v9, v[6:7] -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v7, v7 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v7, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v14, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v11, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v9 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v12, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v1 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v9, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v7, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v17, v9, v[5:6] -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v4 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v7, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v9, v5 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v12, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v11, v0 +; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v14, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v8 +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v0, v8, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v9, v1, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v0 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v0, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v1, v0 +; GISEL-NEXT: v_xor_b32_e32 v14, v6, v0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v14 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v10, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v12, vcc +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v8 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v17, v6, v1, s[4:5] +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v6, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v13 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v14, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v18, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v21, v6 +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v12, v8, vcc +; GISEL-NEXT: v_mul_hi_u32 v12, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v19, v21, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v20, v18, v[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v15, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v21, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v18, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v21, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v21, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_mul_hi_u32 v7, v18, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v21, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v21, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v7, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v10, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v19, v12, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v20, v7, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 +; GISEL-NEXT: v_xor_b32_e32 v11, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v7, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_xor_b32_e32 v7, v8, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v10, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v8, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 @@ -597,38 +596,38 @@ ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v8, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v12 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v8, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v10, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v11, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v14 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v2, v13 ; GISEL-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v6, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v14 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v13 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -2546,185 +2545,183 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v14, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v9 -; GISEL-NEXT: v_lshl_b64 v[0:1], s[4:5], v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v14, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v14, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v9, v8 -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, 0 -; GISEL-NEXT: v_xor_b32_e32 v14, v0, v11 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v14 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v15 -; GISEL-NEXT: v_mov_b32_e32 v0, v9 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, v[0:1] -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v16 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v6, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v14 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v9 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v6, v6 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v15, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v16, 0 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v6, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v6, v9 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v13, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v16, v[10:11] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v7 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v12, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v11, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v8, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v5, v8, v[1:2] +; GISEL-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v14, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v0 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v13, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v13, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v1, v6, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v0, v7, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v11, v0, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v1, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v11 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v9, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v1, s[4:5] +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v16, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v16 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v11 +; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v10, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v6, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v18, v16, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v19, v17, v[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v20, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v17, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v20, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v6, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v17, v10, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v14, v16, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_mul_hi_u32 v7, v17, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v16, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v17, v0 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v16, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v7, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v9, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v18, v14, v[1:2] ; GISEL-NEXT: v_xor_b32_e32 v1, v8, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v18, v9, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v19, v7, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v9, v12, v13, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v8 -; GISEL-NEXT: v_mul_lo_u32 v2, v10, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v9, v5 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v8 -; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v0 +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v8 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v5, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v9, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v15 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v14 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v15 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v10 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v2, v14 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v2, v11 ; GISEL-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v15 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v15, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v14 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v11 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v10 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v12, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc @@ -3213,131 +3210,131 @@ ; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4 -; GISEL-NEXT: v_and_b32_e32 v8, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, 0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 0, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1] -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v7, v[0:1] +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[7:8] ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v10, v4 -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v12 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v11, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v11, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v0 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v10 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v4, v5, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v11, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v11, v5 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v7, v1 +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v2, vcc ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v6, v4 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v8 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, v9, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v0 +; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], 0, v9 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v10, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v15, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v3 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v15, v[0:1] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v18, v[0:1] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v12, v[5:6] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v6, v16, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v0, v15, v4 -; GISEL-NEXT: v_mul_lo_u32 v16, v12, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v4 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v3, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v17 -; GISEL-NEXT: v_sub_i32_e64 v17, s[4:5], v7, v1 -; GISEL-NEXT: v_subbrev_u32_e64 v18, s[6:7], 0, v11, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v17, v1 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v18, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v19, v0, v19, s[6:7] -; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], v11, v3, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v17, v15, v[5:6] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v19, v0, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v0, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v19, v15, v5 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v2, v3, vcc +; GISEL-NEXT: v_mul_hi_u32 v2, v15, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_and_b32_e32 v16, 0xffffff, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v18, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v18, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v15, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v11, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v15, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v4, 0 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v17, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v0, vcc +; GISEL-NEXT: v_mul_hi_u32 v4, v18, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v18, v2, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v4, 0 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v13, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v5, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v17, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v11, v18, v12, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v5, v[0:1] +; GISEL-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v20, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v4, v[0:1] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v14, v18, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v5, v2 -; GISEL-NEXT: v_mul_lo_u32 v6, v4, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v4, v0 ; GISEL-NEXT: v_mul_hi_u32 v13, v4, v2 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], 0, v16 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], 0, v12 ; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v3, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v13, v5, v0 ; GISEL-NEXT: v_mul_hi_u32 v2, v5, v2 -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v6, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v4, v0 +; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v7, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v4, v0 ; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v13, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6 +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v13, v7 ; GISEL-NEXT: v_mul_hi_u32 v0, v5, v0 ; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v6, v3 +; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v7, v3 ; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v3 ; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v4, v2 ; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v5, v0, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v2 -; GISEL-NEXT: v_mul_lo_u32 v4, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, v11, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2 ; GISEL-NEXT: v_mul_hi_u32 v2, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc @@ -3345,7 +3342,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v6, v12, v0 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -3353,38 +3350,38 @@ ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v2, v3 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v4 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v4, v[0:1] ; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v6, v[3:4] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v10 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v2, v8 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v2, v9 ; GISEL-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v8 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v9 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v9 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v6, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v10 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v6, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -364,256 +364,256 @@ ; GISEL-LABEL: v_udiv_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 -; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 +; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 +; GISEL-NEXT: v_trunc_f32_e32 v11, v11 +; GISEL-NEXT: v_trunc_f32_e32 v12, v12 +; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 +; GISEL-NEXT: v_mul_lo_u32 v19, v8, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v9 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 +; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 +; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 +; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 +; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 +; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 +; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 +; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 +; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] +; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] +; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 +; GISEL-NEXT: v_mul_lo_u32 v20, v8, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 +; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 +; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v9, v8 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 +; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 +; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v11, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v11 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v7 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v8, vcc +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v13, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v16, v4, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v9 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v16 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 1, v10 +; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v19 +; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], 1, v15 +; GISEL-NEXT: v_add_i32_e64 v8, s[12:13], v8, v13 +; GISEL-NEXT: v_add_i32_e64 v13, s[12:13], 1, v16 +; GISEL-NEXT: v_add_i32_e64 v11, s[14:15], v11, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v6 +; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v4 +; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, v6, v11 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v10, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], 0, v11, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[14:15] +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v17, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[16:17] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v20, v4 +; GISEL-NEXT: v_addc_u32_e64 v20, s[6:7], 0, v0, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v18 +; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], 0, v2, s[12:13] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v14 +; GISEL-NEXT: v_subb_u32_e64 v14, s[6:7], v1, v12, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v12 +; GISEL-NEXT: v_subb_u32_e64 v12, s[6:7], v3, v4, s[8:9] +; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[22:23] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[8:9] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[18:19] +; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[20:21] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, v17, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[8:9] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v1, v15, v19, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v16, v13, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v20, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v2, v18, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v5, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64: @@ -1249,256 +1249,256 @@ ; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 ; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 ; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 -; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v8 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v5 +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc +; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v5, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 +; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 +; GISEL-NEXT: v_trunc_f32_e32 v11, v11 +; GISEL-NEXT: v_trunc_f32_e32 v12, v12 +; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v15, v6, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v6, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 +; GISEL-NEXT: v_mul_lo_u32 v19, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v6, v9 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 +; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 +; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 +; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 +; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 +; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 +; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 +; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 +; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] +; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] +; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 +; GISEL-NEXT: v_mul_lo_u32 v20, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 +; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v11 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 +; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v9, v6 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v11, v6 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 +; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 +; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v7, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v11, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v7 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 -; GISEL-NEXT: v_trunc_f32_e32 v7, v7 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v11, v6, vcc +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v9, v5, v6 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v4 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v7, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v16, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v5, v10 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v9 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v16 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 1, v10 +; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v19 +; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], 1, v15 +; GISEL-NEXT: v_add_i32_e64 v6, s[12:13], v6, v13 +; GISEL-NEXT: v_add_i32_e64 v13, s[12:13], 1, v16 +; GISEL-NEXT: v_add_i32_e64 v11, s[14:15], v11, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v4 +; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v7 +; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v7 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v4, v11 +; GISEL-NEXT: v_add_i32_e64 v4, s[24:25], v17, v12 +; GISEL-NEXT: v_addc_u32_e64 v7, s[6:7], 0, v11, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], v20, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[14:15] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v18 +; GISEL-NEXT: v_subb_u32_e64 v17, s[6:7], v1, v4, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[16:17] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v17, v8 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[14:15], v17, v8 +; GISEL-NEXT: v_addc_u32_e64 v17, s[10:11], 0, v0, s[10:11] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[18:19] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v8 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v1, v8 +; GISEL-NEXT: v_addc_u32_e64 v1, s[12:13], 0, v7, s[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[22:23] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 +; GISEL-NEXT: v_subb_u32_e64 v14, vcc, v3, v2, s[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_subb_u32_e64 v2, s[8:9], v2, v5, s[8:9] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v2, vcc, 0, v2, s[20:21] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[14:15] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v14, v4, s[8:9] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v18, s[6:7] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v15, v19, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v16, v13, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v17, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v5, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom: @@ -1900,259 +1900,259 @@ ; GISEL-LABEL: v_udiv_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v1 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v8 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 -; GISEL-NEXT: v_trunc_f32_e32 v8, v8 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 +; GISEL-NEXT: s_bfe_i32 s12, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s13, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s14, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s15, -1, 0x10000 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v0 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v7 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 ; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v4, v6, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v11 +; GISEL-NEXT: v_mul_lo_u32 v14, v6, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v18, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v19, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14 +; GISEL-NEXT: v_mul_hi_u32 v20, v12, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 +; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v4, v16 +; GISEL-NEXT: v_mul_hi_u32 v4, v5, v17 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16 -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v5, v13 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v4, v7, v16 ; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 -; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 -; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s6, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s7, -1, 0x10000 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v18, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_mov_b32_e32 v15, s4 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v20, v18 -; GISEL-NEXT: v_mov_b32_e32 v19, s5 -; GISEL-NEXT: v_mul_hi_u32 v16, v8, v16 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; GISEL-NEXT: v_mov_b32_e32 v16, s6 -; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v2 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v2, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v15, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13 +; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v4, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v13 +; GISEL-NEXT: v_add_i32_e64 v17, s[10:11], v19, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v17, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v11, v9 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v20, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v20 +; GISEL-NEXT: v_mov_b32_e32 v20, s12 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v4, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, s13 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v19, s14 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v15 +; GISEL-NEXT: v_mov_b32_e32 v15, s15 +; GISEL-NEXT: v_mul_hi_u32 v16, v7, v16 +; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 +; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v5, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mov_b32_e32 v18, s7 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v12 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v5 +; GISEL-NEXT: v_addc_u32_e64 v7, vcc, v7, v14, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v5 +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v13, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v13, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_mul_hi_u32 v9, v5, v17 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v18 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v18 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v7, v6 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v8 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v13, s[8:9], v16, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v17, s[10:11], v18, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v14, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v16, s[8:9], v17, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v2 -; GISEL-NEXT: v_mul_hi_u32 v8, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v16 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v14, v17 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, 0, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v9 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc +; GISEL-NEXT: v_addc_u32_e64 v7, vcc, v11, v8, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v18, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v17, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v10, 0, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v16, v12 +; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v18, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v17 +; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v5, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v13, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v1, v2 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v2 -; GISEL-NEXT: v_mul_hi_u32 v13, v1, v2 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v3, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v17, v3, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v10 -; GISEL-NEXT: v_mul_lo_u32 v9, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v9, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], 1, v7 -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v5, s[6:7] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v0, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, s[8:9], 0, v10, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v13, v15, v13, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v16, v15, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], 1, v12 -; GISEL-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v14, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v10 -; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[6:7] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v3 -; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 -; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 1, v8 -; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], 0, v17, s[6:7] -; GISEL-NEXT: v_sub_i32_e64 v9, s[6:7], 0, v9 -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v15, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, v8, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v16, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v6, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, 0, v9 +; GISEL-NEXT: v_mul_hi_u32 v16, v1, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v18, 0, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v9 +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v13 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], 1, v5 +; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v17 +; GISEL-NEXT: v_add_i32_e64 v17, s[10:11], 1, v12 +; GISEL-NEXT: v_add_i32_e64 v6, s[12:13], v6, v10 +; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], 1, v13 +; GISEL-NEXT: v_add_i32_e64 v7, s[14:15], v7, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v3, v1 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v0 +; GISEL-NEXT: v_sub_i32_e64 v3, s[18:19], v3, v1 +; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v1, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v3, v1 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v0, v7 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 +; GISEL-NEXT: v_addc_u32_e64 v0, s[6:7], 0, v7, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[14:15] +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[16:17] +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], v18, v3 +; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], 0, v1, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v16 +; GISEL-NEXT: v_addc_u32_e64 v16, s[6:7], 0, v0, s[12:13] +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], v3, v11 +; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v8, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v11 +; GISEL-NEXT: v_subb_u32_e64 v11, s[10:11], 0, v3, s[8:9] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[22:23] +; GISEL-NEXT: v_cndmask_b32_e64 v2, v20, v2, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v8 +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v3 +; GISEL-NEXT: v_subbrev_u32_e64 v8, vcc, 0, v8, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v14, v19, v14, s[10:11] +; GISEL-NEXT: v_subbrev_u32_e64 v8, vcc, 0, v8, s[18:19] +; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[20:21] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v15, v20, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v13, v10, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v16, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v0, v9, v2, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v4, s[8:9] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -1092,96 +1092,96 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v1, v2, s[0:1] ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s14 ; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v0, v3, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v7 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v4, vcc +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v5 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 +; GFX8-NEXT: v_trunc_f32_e32 v14, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v14 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v5 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v6, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v0 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v6, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v12, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v2, v15, v0 -; GFX8-NEXT: v_mul_lo_u32 v17, v12, v1 -; GFX8-NEXT: v_mul_hi_u32 v4, v12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v15, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v16, v2, v16, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v14, v[1:2] +; GFX8-NEXT: v_add_u32_e64 v17, s[0:1], 1, v12 +; GFX8-NEXT: v_addc_u32_e64 v18, s[0:1], 0, v13, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v15, v[1:2] +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v4, v3, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v14, v0 ; GFX8-NEXT: v_mul_lo_u32 v4, v15, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v17, v2 -; GFX8-NEXT: v_mul_hi_u32 v17, v12, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc ; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_mul_hi_u32 v1, v15, v1 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v2, vcc +; GFX8-NEXT: v_mul_hi_u32 v2, v15, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v14, v1 +; GFX8-NEXT: v_mul_hi_u32 v0, v14, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, v15, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_mul_hi_u32 v1, v14, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v0 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v13, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v12, v17, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v14, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v18, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v15, v[3:4] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v16 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v19, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v6, v15, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v12, v3 +; GFX8-NEXT: v_mul_lo_u32 v6, v14, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, v15, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GFX8-NEXT: v_mul_hi_u32 v7, v12, v2 +; GFX8-NEXT: v_mul_hi_u32 v7, v15, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v20, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v6, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v7, v15, v3 -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, v14, v3 +; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 ; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v9, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, v12, v3 +; GFX8-NEXT: v_mul_hi_u32 v9, v15, v3 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v7, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v2, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v9 -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, v14, v3 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v7, v6 ; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v6 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v12, v2 -; GFX8-NEXT: v_addc_u32_e64 v3, s[0:1], v15, v3, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v15, v2 +; GFX8-NEXT: v_addc_u32_e64 v3, s[0:1], v14, v3, s[0:1] ; GFX8-NEXT: v_mul_lo_u32 v6, s11, v2 ; GFX8-NEXT: v_mul_lo_u32 v7, s10, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc @@ -1221,27 +1221,27 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s14, v7 ; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v2, vcc +; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v8 +; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v9, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8 ; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v14 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc -; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12 +; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13 +; GFX8-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v2, vcc +; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1] +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v16, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v8, v12, v14, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v15, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v14, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v18, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, v12, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v8, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v9, s5 @@ -1298,6 +1298,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: s_sub_u32 s2, 0, s14 @@ -1328,7 +1329,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1340,178 +1341,178 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v6, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v6, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v6, v[1:2] +; GFX9-NEXT: v_add3_u32 v8, v3, v0, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v8, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s8, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v4, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v8 -; GFX9-NEXT: v_sub_u32_e32 v0, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s8, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v6, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v4, v2, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v10 +; GFX9-NEXT: v_sub_u32_e32 v1, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v1, v2, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s14 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v0, v3, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s12, v7 -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v0 -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v6, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, v[1:2] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v2, v3, s[0:1] +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s15 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s12, v9 +; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GFX9-NEXT: v_trunc_f32_e32 v15, v2 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v15 +; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v1 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v12, v[1:2] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v2, v15, v0 -; GFX9-NEXT: v_mul_lo_u32 v17, v12, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, v12, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v15, v0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v16, 0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v17, v3, v17, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v19, s[0:1], 0, v14, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v16, v[2:3] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, v15, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v17, v2 -; GFX9-NEXT: v_mul_hi_u32 v17, v12, v1 +; GFX9-NEXT: v_mul_lo_u32 v7, v16, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v12 +; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v3, vcc +; GFX9-NEXT: v_mul_hi_u32 v3, v16, v1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 ; GFX9-NEXT: v_mul_hi_u32 v1, v15, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX9-NEXT: v_add_u32_e32 v3, v7, v3 +; GFX9-NEXT: v_mul_hi_u32 v7, v16, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v4, v4, v17 -; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s12, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v0 -; GFX9-NEXT: v_add3_u32 v1, v4, v2, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v1, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v17, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v13, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v16 -; GFX9-NEXT: v_mul_lo_u32 v5, v15, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v12, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v19, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v10, v12, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v20, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v16, v1 +; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v7, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v2, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v18, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v19, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v7, v[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v14, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v17 +; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, v7, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v20, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v12, v7, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v21, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v10, v15, v3 -; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mul_hi_u32 v6, v12, v3 +; GFX9-NEXT: v_mul_lo_u32 v12, v15, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v10, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v6 +; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v12, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v6 +; GFX9-NEXT: v_add_u32_e32 v8, v12, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v10, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; GFX9-NEXT: v_add3_u32 v3, v6, v5, v3 -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v12, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], v15, v3, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v5, s11, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, s10, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GFX9-NEXT: v_mul_hi_u32 v7, s10, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s11, v2 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v7, s11, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mul_hi_u32 v6, s10, v3 -; GFX9-NEXT: v_mul_hi_u32 v12, s11, v3 -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v7, v2 +; GFX9-NEXT: v_add3_u32 v4, v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v7, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], v15, v4, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v6, s11, v3 +; GFX9-NEXT: v_mul_lo_u32 v7, s10, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, s10, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s11, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v6 +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v10, 0 +; GFX9-NEXT: v_mul_lo_u32 v8, s11, v4 ; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s14, v8, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v9, s11 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s15, v10, v[6:7] -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v2 -; GFX9-NEXT: v_subb_co_u32_e64 v9, s[0:1], v9, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v9 -; GFX9-NEXT: v_sub_u32_e32 v2, s11, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s14, v7 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v2, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 1, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s14, v11 -; GFX9-NEXT: v_subbrev_co_u32_e64 v16, s[0:1], 0, v2, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v11, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v7, s10, v4 +; GFX9-NEXT: v_mul_hi_u32 v13, s11, v4 +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v8, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v3, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0 +; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX9-NEXT: v_add3_u32 v10, v7, v12, v13 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v11, s11 +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8] +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s10, v3 +; GFX9-NEXT: v_subb_co_u32_e64 v11, s[0:1], v11, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11 +; GFX9-NEXT: v_sub_u32_e32 v3, s11, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s14, v8 +; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v13 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s14, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v14, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v15, v18, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v13, v20, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v9, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[4:5] -; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -358,254 +358,254 @@ ; GISEL-LABEL: v_urem_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 -; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 +; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 +; GISEL-NEXT: v_trunc_f32_e32 v11, v11 +; GISEL-NEXT: v_trunc_f32_e32 v12, v12 +; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 +; GISEL-NEXT: v_mul_lo_u32 v19, v8, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v9 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 +; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 +; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 +; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 +; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 +; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 +; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 +; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 +; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] +; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] +; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 +; GISEL-NEXT: v_mul_lo_u32 v20, v8, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 +; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 +; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v9, v8 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 +; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 +; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v4, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v5, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v7 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v8, vcc +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v13, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v16, v4, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 +; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 +; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4 +; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v6 +; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4 +; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6 +; GISEL-NEXT: v_add_i32_e64 v8, s[18:19], v17, v8 +; GISEL-NEXT: v_add_i32_e64 v11, s[18:19], v19, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v9 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v11, v10 +; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8 +; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v7 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11] +; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[10:11] +; GISEL-NEXT: v_subbrev_u32_e64 v19, vcc, 0, v3, s[12:13] +; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v19, v7 +; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v19, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[8:9] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v6, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64: @@ -1101,28 +1101,30 @@ ; GISEL-LABEL: v_urem_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb +; GISEL-NEXT: s_mov_b32 s4, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_cvt_f32_u32_e32 v7, 0x12d8fb ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb +; GISEL-NEXT: s_sub_u32 s5, 0, 0x12d8fb ; GISEL-NEXT: v_madmk_f32 v6, v5, 0x4f800000, v7 -; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: s_bfe_i32 s7, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s8, -1, 0x10000 ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GISEL-NEXT: v_mov_b32_e32 v6, s4 -; GISEL-NEXT: v_mov_b32_e32 v5, s5 +; GISEL-NEXT: v_mov_b32_e32 v6, s7 +; GISEL-NEXT: v_mov_b32_e32 v5, s8 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GISEL-NEXT: s_sub_u32 s9, 0, 0x12d8fb +; GISEL-NEXT: s_sub_u32 s7, 0, 0x12d8fb ; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; GISEL-NEXT: s_subb_u32 s10, 0, 0 -; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s12, -1, 0x10000 +; GISEL-NEXT: s_subb_u32 s8, 0, 0 +; GISEL-NEXT: s_bfe_i32 s9, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s10, -1, 0x10000 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v7 +; GISEL-NEXT: v_mov_b32_e32 v11, s9 +; GISEL-NEXT: v_mov_b32_e32 v12, s10 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 ; GISEL-NEXT: v_trunc_f32_e32 v10, v10 ; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 @@ -1130,185 +1132,183 @@ ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v9 +; GISEL-NEXT: v_mul_lo_u32 v13, s5, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, s9, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, s6, v8 -; GISEL-NEXT: v_mul_lo_u32 v14, s7, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, s6, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, s9, v7 -; GISEL-NEXT: v_mul_lo_u32 v17, s10, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, s9, v7 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v9, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, s7, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, s5, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, s6, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, s5, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, s7, v7 +; GISEL-NEXT: v_mul_lo_u32 v19, s8, v7 +; GISEL-NEXT: v_mul_hi_u32 v20, s7, v7 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_mul_lo_u32 v16, v9, v15 +; GISEL-NEXT: v_mul_hi_u32 v21, v8, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v19, v14 +; GISEL-NEXT: v_mul_lo_u32 v19, v10, v18 +; GISEL-NEXT: v_mul_hi_u32 v22, v7, v18 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v18 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v20 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v20, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v23, v8, v13 ; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v16 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v7, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v10, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v7, v12 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v11 -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v7, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v24, v7, v14 +; GISEL-NEXT: v_mul_lo_u32 v25, v10, v14 +; GISEL-NEXT: v_mul_hi_u32 v26, v7, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v14 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, s7, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, s6, v8 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v16 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, s9, v7 -; GISEL-NEXT: v_mul_lo_u32 v15, s10, v7 -; GISEL-NEXT: v_mul_hi_u32 v16, s9, v7 -; GISEL-NEXT: v_mul_lo_u32 v17, s6, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, s9, v10 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v15 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v10, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v24 +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v25, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v21 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v15 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v12 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v22 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mov_b32_e32 v19, s11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; GISEL-NEXT: v_mov_b32_e32 v18, s12 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v15 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v26 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v20, v21 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v24, v19 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v25, v22 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, s5, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, s6, v8 +; GISEL-NEXT: v_mul_hi_u32 v16, s5, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v18 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, s7, v7 +; GISEL-NEXT: v_mul_lo_u32 v17, s8, v7 +; GISEL-NEXT: v_mul_hi_u32 v18, s7, v7 +; GISEL-NEXT: v_mul_lo_u32 v19, s5, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v21, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_mul_lo_u32 v22, s7, v10 +; GISEL-NEXT: v_mul_lo_u32 v23, v10, v14 +; GISEL-NEXT: v_mul_hi_u32 v24, v7, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v14 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v22 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v18 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v15 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v15 +; GISEL-NEXT: v_mul_hi_u32 v19, v8, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 +; GISEL-NEXT: v_mul_lo_u32 v22, v7, v16 +; GISEL-NEXT: v_mul_lo_u32 v25, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v26, v7, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v10, v16 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v20, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v23, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v25, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v21, vcc, v22, v24 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v26 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v20, v17 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v19 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v21 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v25, v22 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v15, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v16, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v19, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_lo_u32 v11, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v21, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v22, v2, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v16, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v7, s[6:7], v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v18, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v21, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_lo_u32 v14, s8, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, 0, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, s8, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_lo_u32 v12, s8, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v20, v14 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v21, v16 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, s4, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, 0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, s4, v8 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v16, s4, v7 +; GISEL-NEXT: v_mul_lo_u32 v18, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, s4, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, s8, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_mul_lo_u32 v9, s4, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, s4, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v17, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v18, v10 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 ; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc ; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v12 +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v16 ; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v3, v7, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v4 @@ -1317,32 +1317,32 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v8, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v13, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc @@ -1352,251 +1352,251 @@ ; CGP-LABEL: v_urem_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s8, 0x12d8fb +; CGP-NEXT: s_mov_b32 s4, 0x12d8fb ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; CGP-NEXT: s_mov_b32 s6, 0xffed2705 -; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s5, -1, 0x10000 +; CGP-NEXT: s_mov_b32 s5, 0xffed2705 +; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CGP-NEXT: v_cvt_f32_u32_e32 v7, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v8, 0 -; CGP-NEXT: s_bfe_i32 s7, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 ; CGP-NEXT: s_bfe_i32 s9, -1, 0x10000 ; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CGP-NEXT: v_mov_b32_e32 v6, s4 -; CGP-NEXT: v_mov_b32_e32 v9, s5 +; CGP-NEXT: v_mov_b32_e32 v6, s6 +; CGP-NEXT: v_mov_b32_e32 v9, s7 ; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 +; CGP-NEXT: v_mov_b32_e32 v8, s8 +; CGP-NEXT: v_mov_b32_e32 v10, s9 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v5 -; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v7 -; CGP-NEXT: v_trunc_f32_e32 v8, v8 -; CGP-NEXT: v_trunc_f32_e32 v10, v10 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 +; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v5 +; CGP-NEXT: v_mul_f32_e32 v12, 0x2f800000, v7 +; CGP-NEXT: v_trunc_f32_e32 v11, v11 +; CGP-NEXT: v_trunc_f32_e32 v12, v12 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v11, v11 +; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v12 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v12 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v8 +; CGP-NEXT: v_mul_lo_u32 v13, s5, v11 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v16, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v17, -1, v7 -; CGP-NEXT: v_mul_hi_u32 v18, s6, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; CGP-NEXT: v_mul_lo_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v19, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_mul_lo_u32 v17, v10, v16 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v7, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v10, v16 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; CGP-NEXT: v_mul_lo_u32 v18, v7, v12 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_mul_lo_u32 v15, v5, v11 -; CGP-NEXT: v_mul_lo_u32 v17, v8, v11 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; CGP-NEXT: v_mul_hi_u32 v14, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 -; CGP-NEXT: v_mul_lo_u32 v19, v10, v12 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_mul_hi_u32 v18, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v19, v16 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, s5, v12 +; CGP-NEXT: v_mul_lo_u32 v15, s5, v5 +; CGP-NEXT: v_mul_lo_u32 v16, -1, v5 +; CGP-NEXT: v_mul_hi_u32 v17, s5, v5 +; CGP-NEXT: v_mul_lo_u32 v18, s5, v7 +; CGP-NEXT: v_mul_lo_u32 v19, -1, v7 +; CGP-NEXT: v_mul_hi_u32 v20, s5, v7 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; CGP-NEXT: v_mul_lo_u32 v16, v11, v15 +; CGP-NEXT: v_mul_hi_u32 v21, v5, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v11, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v19, v14 +; CGP-NEXT: v_mul_lo_u32 v19, v12, v18 +; CGP-NEXT: v_mul_hi_u32 v22, v7, v18 +; CGP-NEXT: v_mul_hi_u32 v18, v12, v18 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v20 +; CGP-NEXT: v_mul_lo_u32 v17, v5, v13 +; CGP-NEXT: v_mul_lo_u32 v20, v11, v13 +; CGP-NEXT: v_mul_hi_u32 v23, v5, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v11, v13 +; CGP-NEXT: v_mul_lo_u32 v24, v7, v14 +; CGP-NEXT: v_mul_lo_u32 v25, v12, v14 +; CGP-NEXT: v_mul_hi_u32 v26, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v12, v14 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v18, v17 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc -; CGP-NEXT: v_mul_lo_u32 v11, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v13, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v16 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc -; CGP-NEXT: v_mul_lo_u32 v12, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v15, -1, v7 -; CGP-NEXT: v_mul_hi_u32 v16, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v17, s6, v8 -; CGP-NEXT: v_mul_lo_u32 v18, v8, v11 -; CGP-NEXT: v_mul_hi_u32 v19, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; CGP-NEXT: v_mul_lo_u32 v17, s6, v10 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; CGP-NEXT: v_mul_lo_u32 v17, v10, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; CGP-NEXT: v_mul_lo_u32 v16, v7, v15 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v20, v15 +; CGP-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v19, vcc, v19, v24 +; CGP-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v18, vcc, v25, v18 +; CGP-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v21 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v23 +; CGP-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v19, vcc, v19, v22 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v26 +; CGP-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_lo_u32 v14, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v16, v8, v13 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; CGP-NEXT: v_mul_hi_u32 v14, v5, v13 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19 -; CGP-NEXT: v_mul_lo_u32 v19, v10, v15 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 +; CGP-NEXT: v_add_i32_e32 v17, vcc, v20, v21 +; CGP-NEXT: v_add_i32_e32 v19, vcc, v24, v19 +; CGP-NEXT: v_add_i32_e32 v20, vcc, v25, v22 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v19 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v17, v7, v15 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v19, v12 +; CGP-NEXT: v_add_i32_e32 v17, vcc, v20, v19 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc +; CGP-NEXT: v_mul_lo_u32 v13, s5, v5 +; CGP-NEXT: v_mul_lo_u32 v15, -1, v5 +; CGP-NEXT: v_mul_hi_u32 v16, s5, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v18 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v14, vcc +; CGP-NEXT: v_mul_lo_u32 v14, s5, v7 +; CGP-NEXT: v_mul_lo_u32 v17, -1, v7 +; CGP-NEXT: v_mul_hi_u32 v18, s5, v7 +; CGP-NEXT: v_mul_lo_u32 v19, s5, v11 +; CGP-NEXT: v_mul_lo_u32 v20, v11, v13 +; CGP-NEXT: v_mul_hi_u32 v21, v5, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v11, v13 +; CGP-NEXT: v_mul_lo_u32 v22, s5, v12 +; CGP-NEXT: v_mul_lo_u32 v23, v12, v14 +; CGP-NEXT: v_mul_hi_u32 v24, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v12, v14 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v22 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v18 +; CGP-NEXT: v_mul_lo_u32 v17, v5, v15 +; CGP-NEXT: v_mul_lo_u32 v18, v11, v15 +; CGP-NEXT: v_mul_hi_u32 v19, v5, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v11, v15 +; CGP-NEXT: v_mul_lo_u32 v22, v7, v16 +; CGP-NEXT: v_mul_lo_u32 v25, v12, v16 +; CGP-NEXT: v_mul_hi_u32 v26, v7, v16 +; CGP-NEXT: v_mul_hi_u32 v16, v12, v16 +; CGP-NEXT: v_add_i32_e32 v17, vcc, v20, v17 +; CGP-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v22, vcc, v23, v22 +; CGP-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v25, v14 +; CGP-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v21 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v19 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; CGP-NEXT: v_add_i32_e32 v21, vcc, v22, v24 +; CGP-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v26 +; CGP-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v20, v17 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v19 +; CGP-NEXT: v_add_i32_e32 v19, vcc, v23, v21 +; CGP-NEXT: v_add_i32_e32 v20, vcc, v25, v22 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; CGP-NEXT: v_mov_b32_e32 v19, s7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; CGP-NEXT: v_mov_b32_e32 v18, s9 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v15 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v16 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v5 -; CGP-NEXT: v_mul_hi_u32 v13, v0, v5 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v20, v19 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v11, v15, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v1, v5 +; CGP-NEXT: v_mul_hi_u32 v15, v0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v14, v2, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v16, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v16, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_mul_lo_u32 v15, v0, v8 -; CGP-NEXT: v_mul_lo_u32 v16, v1, v8 -; CGP-NEXT: v_mul_hi_u32 v17, v0, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; CGP-NEXT: v_mul_lo_u32 v17, v0, v11 +; CGP-NEXT: v_mul_lo_u32 v18, v1, v11 +; CGP-NEXT: v_mul_hi_u32 v19, v0, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v1, v11 +; CGP-NEXT: v_mul_lo_u32 v20, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v21, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v22, v2, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v3, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v18, v5 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v20 +; CGP-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v21, v7 +; CGP-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v19 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; CGP-NEXT: v_mul_lo_u32 v11, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v13, v3, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_hi_u32 v11, v2, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v3, v10 -; CGP-NEXT: v_add_i32_e64 v5, s[6:7], v16, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v7, s[6:7], v13, v7 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v22 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v18, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v20, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v21, v16 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_lo_u32 v14, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v15, 0, v5 -; CGP-NEXT: v_mul_hi_u32 v5, s8, v5 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v12, s8, v7 -; CGP-NEXT: v_mul_lo_u32 v16, 0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, s8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_mul_lo_u32 v8, s8, v8 -; CGP-NEXT: v_mul_lo_u32 v10, s8, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v16, v10 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v5, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_mul_lo_u32 v15, s4, v5 +; CGP-NEXT: v_mul_lo_u32 v17, 0, v5 +; CGP-NEXT: v_mul_hi_u32 v5, s4, v5 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_mul_lo_u32 v16, s4, v7 +; CGP-NEXT: v_mul_lo_u32 v18, 0, v7 +; CGP-NEXT: v_mul_hi_u32 v7, s4, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_mul_lo_u32 v11, s4, v11 +; CGP-NEXT: v_mul_lo_u32 v12, s4, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v17, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v1, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v12 -; CGP-NEXT: v_subb_u32_e64 v10, s[6:7], v3, v7, s[4:5] +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v16 +; CGP-NEXT: v_subb_u32_e64 v12, s[6:7], v3, v7, s[4:5] ; CGP-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v7 ; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] -; CGP-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8 +; CGP-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v11 ; CGP-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[6:7] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v6, v19, v7, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc ; CGP-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v4 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e32 v12, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v7, v4 -; CGP-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v7, v4 +; CGP-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 -; CGP-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc +; CGP-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v3, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] ; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v3, v12, v3, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i64> %num, ret <2 x i64> %result @@ -1783,256 +1783,256 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 -; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 -; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 -; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 +; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 +; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 +; GISEL-NEXT: v_trunc_f32_e32 v11, v11 +; GISEL-NEXT: v_trunc_f32_e32 v12, v12 +; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v15, v6, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v6, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 +; GISEL-NEXT: v_mul_lo_u32 v19, v8, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v9 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 +; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 +; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 +; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 +; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 +; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 +; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 +; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 +; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] +; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] +; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 +; GISEL-NEXT: v_mul_lo_u32 v20, v8, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 +; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 +; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v9, v8 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 +; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 +; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 +; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v7, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 -; GISEL-NEXT: v_mul_lo_u32 v9, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v7 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v10, v7 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 -; GISEL-NEXT: v_trunc_f32_e32 v7, v7 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v8, vcc +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v9, v5, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v4 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v16, v4, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 +; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 +; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4 +; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v6 +; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4 +; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6 +; GISEL-NEXT: v_add_i32_e64 v8, s[18:19], v17, v8 +; GISEL-NEXT: v_add_i32_e64 v11, s[18:19], v19, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v9 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v11, v10 +; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8 +; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11] +; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v14, vcc, 0, v3, s[12:13] +; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[8:9] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v6, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v3, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_pow2_shl_denom: @@ -2428,255 +2428,255 @@ ; GISEL-LABEL: v_urem_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v4 -; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v6 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v1 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v8 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 -; GISEL-NEXT: v_trunc_f32_e32 v8, v8 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 +; GISEL-NEXT: s_bfe_i32 s12, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s13, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s14, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s15, -1, 0x10000 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v0 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v7 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 ; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v11 +; GISEL-NEXT: v_mul_lo_u32 v14, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v15, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v18, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v19, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14 +; GISEL-NEXT: v_mul_hi_u32 v20, v4, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 +; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v5, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v5, v13 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v4, v12 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v18, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v7, v12 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v12 +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v15, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13 +; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v16, v5, v13 +; GISEL-NEXT: v_add_i32_e64 v17, s[10:11], v19, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16 -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 -; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 -; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s6, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s7, -1, 0x10000 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v18, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v16, s[8:9], v17, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v20, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v20 +; GISEL-NEXT: v_mov_b32_e32 v20, s12 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_mov_b32_e32 v15, s4 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v20, v18 -; GISEL-NEXT: v_mov_b32_e32 v19, s5 -; GISEL-NEXT: v_mul_hi_u32 v16, v8, v16 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; GISEL-NEXT: v_mov_b32_e32 v16, s6 +; GISEL-NEXT: v_mov_b32_e32 v15, s13 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v16, v19 +; GISEL-NEXT: v_mov_b32_e32 v19, s14 +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v14 +; GISEL-NEXT: v_mov_b32_e32 v14, s15 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v12 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 +; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v5, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v5 +; GISEL-NEXT: v_addc_u32_e64 v7, vcc, v7, v12, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v5 +; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v13, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v13, v4, v18 +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_mul_hi_u32 v9, v5, v17 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v18 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v18 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v6 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v4, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v8 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v13, s[8:9], v16, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v17, s[10:11], v18, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v12, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v16, s[8:9], v17, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v2 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v2, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v11, v9 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mov_b32_e32 v18, s7 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v2 -; GISEL-NEXT: v_mul_hi_u32 v8, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v16 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v3, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v17, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v10, 0, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v13, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v9 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v10 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc +; GISEL-NEXT: v_addc_u32_e64 v7, vcc, v11, v8, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v18, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v9, v4 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v16, v11 +; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v18, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v2 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v10 -; GISEL-NEXT: v_mul_lo_u32 v4, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v8 -; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11 -; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v10 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v17 +; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v5, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v18, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v1, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 +; GISEL-NEXT: v_mul_lo_u32 v16, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v17, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v12 +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v9 +; GISEL-NEXT: v_add_i32_e64 v7, s[6:7], v7, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v3, v1 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v0 +; GISEL-NEXT: v_sub_i32_e64 v8, s[10:11], v3, v1 +; GISEL-NEXT: v_sub_i32_e64 v9, s[12:13], v2, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[8:9] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v9, v0 +; GISEL-NEXT: v_sub_i32_e64 v1, s[14:15], v8, v1 +; GISEL-NEXT: v_sub_i32_e64 v0, s[16:17], v9, v0 +; GISEL-NEXT: v_add_i32_e64 v6, s[18:19], v13, v6 +; GISEL-NEXT: v_add_i32_e64 v7, s[18:19], v17, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v6, v4 +; GISEL-NEXT: v_add_i32_e64 v5, s[6:7], v7, v5 +; GISEL-NEXT: v_subb_u32_e64 v6, s[6:7], 0, v4, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[6:7], 0, v4 +; GISEL-NEXT: v_subb_u32_e64 v7, s[6:7], 0, v5, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v5, s[6:7], 0, v5 ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v0, vcc, 0, v0, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v5, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v8, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_subbrev_u32_e64 v5, s[4:5], 0, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v13, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v4, vcc, 0, v4, s[10:11] +; GISEL-NEXT: v_subbrev_u32_e64 v5, vcc, 0, v5, s[12:13] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v4, s[14:15] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v17, s[6:7], 0, v5, s[16:17] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v10 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v14, v13, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v0, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[8:9] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -1358,29 +1358,29 @@ ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16 ; GFX6-NEXT: v_min_u32_e32 v16, v3, v19 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16 +; GFX6-NEXT: v_min_u32_e32 v16, v4, v20 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v16 +; GFX6-NEXT: v_min_u32_e32 v16, v5, v21 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v16 +; GFX6-NEXT: v_min_u32_e32 v16, v6, v22 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v16 +; GFX6-NEXT: v_min_u32_e32 v16, v7, v23 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v16 +; GFX6-NEXT: v_min_u32_e32 v16, v8, v24 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v16 +; GFX6-NEXT: v_min_u32_e32 v16, v9, v25 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 +; GFX6-NEXT: v_min_u32_e32 v16, v10, v26 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v16 ; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX6-NEXT: v_min_u32_e32 v17, v4, v20 -; GFX6-NEXT: v_min_u32_e32 v18, v5, v21 -; GFX6-NEXT: v_min_u32_e32 v19, v6, v22 -; GFX6-NEXT: v_min_u32_e32 v20, v7, v23 -; GFX6-NEXT: v_min_u32_e32 v21, v8, v24 -; GFX6-NEXT: v_min_u32_e32 v22, v9, v25 -; GFX6-NEXT: v_min_u32_e32 v23, v10, v26 -; GFX6-NEXT: v_min_u32_e32 v24, v11, v27 -; GFX6-NEXT: v_min_u32_e32 v25, v12, v28 -; GFX6-NEXT: v_min_u32_e32 v26, v13, v29 -; GFX6-NEXT: v_min_u32_e32 v27, v14, v30 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v18 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v19 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v20 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v21 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v22 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v23 -; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v24 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v25 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v26 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v27 +; GFX6-NEXT: v_min_u32_e32 v17, v11, v27 +; GFX6-NEXT: v_min_u32_e32 v18, v12, v28 +; GFX6-NEXT: v_min_u32_e32 v19, v13, v29 +; GFX6-NEXT: v_min_u32_e32 v20, v14, v30 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v18 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v19 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v20 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_min_u32_e32 v16, v15, v16 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs -; TRAP-HANDLER-ENABLE: NumSgprs: 61 -; TRAP-HANDLER-DISABLE: NumSgprs: 77 +; TRAP-HANDLER-ENABLE: NumSgprs: 77 +; TRAP-HANDLER-DISABLE: NumSgprs: 92 define amdgpu_kernel void @amdhsa_trap_num_sgprs( i32 addrspace(1)* %out0, i32 %in0, i32 addrspace(1)* %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -4,13 +4,13 @@ define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 { ; CHECK-LABEL: spill: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s27, s[4:5], 0x2 +; CHECK-NEXT: s_load_dword s44, s[4:5], 0x2 ; CHECK-NEXT: s_mov_b64 s[98:99], s[2:3] ; CHECK-NEXT: s_mov_b64 s[96:97], s[0:1] ; CHECK-NEXT: s_add_u32 s96, s96, s7 ; CHECK-NEXT: s_addc_u32 s97, s97, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_eq_u32 s27, 0 +; CHECK-NEXT: s_cmp_eq_u32 s44, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: ;;#ASMEND @@ -973,10 +973,10 @@ ; CHECK-NEXT: v_writelane_b32 v1, s99, 4 ; CHECK-NEXT: v_writelane_b32 v0, s93, 62 ; CHECK-NEXT: v_writelane_b32 v1, s100, 5 -; CHECK-NEXT: s_mov_b32 s31, s12 +; CHECK-NEXT: s_mov_b32 s49, s12 ; CHECK-NEXT: v_writelane_b32 v0, s94, 63 ; CHECK-NEXT: v_writelane_b32 v1, s101, 6 -; CHECK-NEXT: s_cmp_eq_u32 s31, 0 +; CHECK-NEXT: s_cmp_eq_u32 s49, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir --- a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir +++ b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir @@ -49,29 +49,29 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: undef %11.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF3]].sub0, [[DEF5]].sub0, 0, implicit $exec - ; CHECK-NEXT: dead undef %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[DEF3]].sub1, [[DEF5]].sub1, %18, 0, implicit $exec - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF7]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] - ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF1]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub1 + ; CHECK-NEXT: undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec + ; CHECK-NEXT: dead undef %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, %18, 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]] + ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub1 ; CHECK-NEXT: dead [[COPY6:%[0-9]+]]:vgpr_32 = COPY %11.sub0 - ; CHECK-NEXT: dead [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF6]], implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF7]], 288, 0, implicit $exec :: (store (s64), addrspace 1) + ; CHECK-NEXT: dead [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF7]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -81,7 +81,7 @@ ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef [[DEF4]].sub1:vreg_64 = COPY [[COPY5]] + ; CHECK-NEXT: undef [[DEF5]].sub1:vreg_64 = COPY [[COPY5]] ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir @@ -24,7 +24,7 @@ ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF @@ -32,9 +32,10 @@ ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF]], implicit $mode, implicit $exec ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF1]], implicit $mode, implicit $exec ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -50,34 +51,33 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]] - ; CHECK-NEXT: [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec - ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MUL_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF12]], implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MAC_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MAC_F32_e32_]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[DEF13:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead %23:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF13]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MOV_B32_e32_1]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF14:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; CHECK-NEXT: $sgpr4 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = COPY [[DEF10]] + ; CHECK-NEXT: $vgpr0 = COPY [[DEF11]] ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] - ; CHECK-NEXT: $vgpr1 = COPY [[DEF6]] + ; CHECK-NEXT: $vgpr1 = COPY [[DEF7]] ; CHECK-NEXT: $vgpr0 = COPY [[V_MUL_F32_e32_1]] ; CHECK-NEXT: $vgpr1 = COPY [[V_MUL_F32_e32_2]] ; CHECK-NEXT: $vgpr2 = COPY [[V_MUL_F32_e32_3]] - ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF13]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0 - ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF7]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MAC_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF11]], [[DEF8]], [[V_MAC_F32_e32_1]], implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_1]], 0, [[DEF3]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_1]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_1]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF14]], [[DEF9]], 0, 0, implicit $exec + ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF14]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0 + ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF8]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MAC_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], [[V_MAC_F32_e32_]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead %26:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead %27:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead %28:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_]], 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -11,24 +11,25 @@ ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s0, 0 -; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s0, 8 -; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 +; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s0, 16 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s0, 24 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; CHECK-NEXT: s_xor_b64 s[2:3], s[6:7], -1 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 +; CHECK-NEXT: s_xor_b64 s[4:5], s[8:9], -1 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 -; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s1, 8 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[14:15] +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 +; CHECK-NEXT: s_bitcmp1_b32 s1, 8 +; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 +; CHECK-NEXT: s_and_b64 s[4:5], exec, s[4:5] +; CHECK-NEXT: s_and_b64 s[6:7], exec, s[10:11] +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 -; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: s_and_b64 s[4:5], exec, s[8:9] -; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .LBB0_1: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[18:19], 0 @@ -41,17 +42,17 @@ ; CHECK-NEXT: s_cbranch_vccnz .LBB0_12 ; CHECK-NEXT: .LBB0_3: ; %bb7 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_and_b64 vcc, exec, s[2:3] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.4: ; %bb8 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 vcc, s[2:3] +; CHECK-NEXT: s_mov_b64 vcc, s[4:5] ; CHECK-NEXT: s_cbranch_vccz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb9 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[16:17], 0 ; CHECK-NEXT: s_mov_b64 s[18:19], -1 -; CHECK-NEXT: s_mov_b64 s[22:23], s[8:9] +; CHECK-NEXT: s_mov_b64 s[22:23], s[10:11] ; CHECK-NEXT: s_cbranch_execz .LBB0_7 ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_6: ; in Loop: Header=BB0_3 Depth=1 @@ -62,7 +63,7 @@ ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[18:19], -1 ; CHECK-NEXT: s_mov_b64 s[16:17], 0 -; CHECK-NEXT: s_mov_b64 s[22:23], s[12:13] +; CHECK-NEXT: s_mov_b64 s[22:23], s[14:15] ; CHECK-NEXT: .LBB0_8: ; %Flow9 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[20:21], -1 @@ -71,12 +72,12 @@ ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: ; %bb.9: ; %bb13 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 vcc, s[4:5] +; CHECK-NEXT: s_mov_b64 vcc, s[6:7] ; CHECK-NEXT: s_cbranch_vccz .LBB0_11 ; CHECK-NEXT: ; %bb.10: ; %bb16 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[16:17], 0 -; CHECK-NEXT: s_mov_b64 s[22:23], s[10:11] +; CHECK-NEXT: s_mov_b64 s[22:23], s[12:13] ; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17] ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_11: ; in Loop: Header=BB0_3 Depth=1 @@ -86,18 +87,18 @@ ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_12: ; %loop.exit.guard6 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_xor_b64 s[14:15], s[20:21], -1 +; CHECK-NEXT: s_xor_b64 s[22:23], s[20:21], -1 ; CHECK-NEXT: s_mov_b64 s[20:21], -1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[14:15] +; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23] ; CHECK-NEXT: s_cbranch_vccz .LBB0_16 ; CHECK-NEXT: ; %bb.13: ; %bb14 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_15 ; CHECK-NEXT: ; %bb.14: ; %bb15 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:4 -; CHECK-NEXT: buffer_store_dword v1, off, s[24:27], 0 +; CHECK-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 +; CHECK-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; CHECK-NEXT: .LBB0_15: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[20:21], 0 @@ -112,10 +113,10 @@ ; CHECK-NEXT: s_and_b64 vcc, exec, s[18:19] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_23 ; CHECK-NEXT: ; %bb.19: ; %bb17 -; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] +; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_cbranch_vccz .LBB0_21 ; CHECK-NEXT: ; %bb.20: ; %bb19 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccz .LBB0_22 ; CHECK-NEXT: .LBB0_21: ; %bb18 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -888,57 +888,57 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 -; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: s_addc_u32 s5, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v14, s3 -; CI-NEXT: v_mov_b32_e32 v13, s2 -; CI-NEXT: s_add_u32 s2, s0, 48 -; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v3 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v6 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; CI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; CI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; CI-NEXT: v_mov_b32_e32 v7, s3 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; CI-NEXT: v_mov_b32_e32 v6, s2 +; CI-NEXT: s_add_u32 s2, s0, 48 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; CI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 ; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v25 ; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v1 ; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v24 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_mov_b32_e32 v17, s1 -; CI-NEXT: v_mov_b32_e32 v14, s2 -; CI-NEXT: v_mov_b32_e32 v16, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_mov_b32_e32 v21, s3 +; CI-NEXT: v_mov_b32_e32 v23, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_mov_b32_e32 v20, s2 +; CI-NEXT: v_mov_b32_e32 v22, s0 +; CI-NEXT: flat_store_dwordx4 v[6:7], v[16:19] ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] +; CI-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; CI-NEXT: flat_store_dwordx4 v[22:23], v[8:11] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v16f16_to_v16f32: @@ -947,24 +947,26 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v19, s3 -; VI-NEXT: v_mov_b32_e32 v18, s2 +; VI-NEXT: v_mov_b32_e32 v23, s3 +; VI-NEXT: v_mov_b32_e32 v22, s2 ; VI-NEXT: s_add_u32 s2, s0, 48 -; VI-NEXT: v_mov_b32_e32 v17, s1 +; VI-NEXT: v_mov_b32_e32 v21, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v16, s0 +; VI-NEXT: v_mov_b32_e32 v20, s0 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v21, s3 -; VI-NEXT: v_mov_b32_e32 v20, s2 +; VI-NEXT: v_mov_b32_e32 v25, s3 +; VI-NEXT: v_mov_b32_e32 v27, s1 +; VI-NEXT: v_mov_b32_e32 v24, s2 +; VI-NEXT: v_mov_b32_e32 v26, s0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v14, v3 ; VI-NEXT: v_cvt_f32_f16_e32 v12, v2 @@ -974,21 +976,19 @@ ; VI-NEXT: v_cvt_f32_f16_e32 v8, v0 ; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v18, v7 +; VI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; VI-NEXT: v_cvt_f32_f16_sdwa v19, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; VI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; VI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; VI-NEXT: v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v4 ; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: flat_store_dwordx4 v[22:23], v[12:15] +; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; VI-NEXT: flat_store_dwordx4 v[24:25], v[16:19] +; VI-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; VI-NEXT: s_endpgm %val = load <16 x half>, ptr addrspace(1) %in %cvt = fpext <16 x half> %val to <16 x float> @@ -1183,43 +1183,43 @@ ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v7, s3 -; CI-NEXT: v_mov_b32_e32 v6, s2 +; CI-NEXT: v_mov_b32_e32 v19, s3 +; CI-NEXT: v_mov_b32_e32 v18, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 -; CI-NEXT: v_mov_b32_e32 v13, s1 +; CI-NEXT: v_mov_b32_e32 v17, s1 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: v_mov_b32_e32 v16, s0 ; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: v_mov_b32_e32 v15, s3 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v21, s3 +; CI-NEXT: v_mov_b32_e32 v23, s1 +; CI-NEXT: v_mov_b32_e32 v20, s2 +; CI-NEXT: v_mov_b32_e32 v22, s0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v11 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; CI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 -; CI-NEXT: v_mov_b32_e32 v17, s1 -; CI-NEXT: v_mov_b32_e32 v16, s0 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; CI-NEXT: v_cvt_f32_f16_e32 v10, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v3 +; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v10 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v24 +; CI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; CI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[22:23], v[4:7] ; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v8f16_to_v8f64: @@ -1231,39 +1231,39 @@ ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v8, s3 -; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: v_mov_b32_e32 v19, s3 +; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 -; VI-NEXT: v_mov_b32_e32 v13, s1 +; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v12, s0 +; VI-NEXT: v_mov_b32_e32 v16, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: v_mov_b32_e32 v15, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: v_mov_b32_e32 v21, s3 +; VI-NEXT: v_mov_b32_e32 v23, s1 +; VI-NEXT: v_mov_b32_e32 v20, s2 +; VI-NEXT: v_mov_b32_e32 v22, s0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; VI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v7, v2 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 -; VI-NEXT: flat_store_dwordx4 v[7:8], v[3:6] -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v11 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 +; VI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v10 +; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v3 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v7 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 -; VI-NEXT: v_mov_b32_e32 v17, s1 -; VI-NEXT: v_mov_b32_e32 v16, s0 -; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[16:17], v[4:7] -; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v24 +; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; VI-NEXT: flat_store_dwordx4 v[22:23], v[4:7] +; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; VI-NEXT: s_endpgm %val = load <8 x half>, ptr addrspace(1) %in %cvt = fpext <8 x half> %val to <8 x double> @@ -1304,28 +1304,26 @@ ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 ; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: s_nop 0 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v15, s3 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] @@ -1333,16 +1331,19 @@ ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; CI-NEXT: v_mov_b32_e32 v14, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x60 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 ; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v5 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; CI-NEXT: v_mov_b32_e32 v17, s3 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v18 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 ; CI-NEXT: v_mov_b32_e32 v16, s2 @@ -1353,16 +1354,16 @@ ; CI-NEXT: s_add_u32 s0, s0, 64 ; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v19 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 -; CI-NEXT: v_mov_b32_e32 v19, s3 +; CI-NEXT: v_mov_b32_e32 v21, s3 ; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: v_mov_b32_e32 v20, s2 ; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[20:21], v[0:3] ; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; CI-NEXT: s_endpgm ; @@ -1372,85 +1373,84 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v14, s3 -; VI-NEXT: v_mov_b32_e32 v13, s2 +; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v16, s3 -; VI-NEXT: v_mov_b32_e32 v15, s2 +; VI-NEXT: v_mov_b32_e32 v13, s3 +; VI-NEXT: v_mov_b32_e32 v12, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v18, s3 -; VI-NEXT: v_mov_b32_e32 v17, s2 +; VI-NEXT: v_mov_b32_e32 v15, s3 +; VI-NEXT: v_mov_b32_e32 v14, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x50 -; VI-NEXT: v_mov_b32_e32 v12, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v11, s0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 -; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 -; VI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; VI-NEXT: v_mov_b32_e32 v14, s3 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: v_mov_b32_e32 v13, s2 +; VI-NEXT: v_mov_b32_e32 v17, s3 +; VI-NEXT: v_mov_b32_e32 v16, s2 ; VI-NEXT: s_add_u32 s2, s0, 64 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; VI-NEXT: v_mov_b32_e32 v16, s3 -; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; VI-NEXT: v_mov_b32_e32 v15, s2 +; VI-NEXT: v_mov_b32_e32 v19, s3 +; VI-NEXT: v_mov_b32_e32 v11, s1 +; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x70 +; VI-NEXT: v_mov_b32_e32 v10, s0 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] -; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9 -; VI-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7] -; VI-NEXT: v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v9 -; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 -; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 -; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 -; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 ; VI-NEXT: s_add_u32 s0, s0, 0x60 -; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4] ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 -; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 -; VI-NEXT: v_mov_b32_e32 v20, s3 -; VI-NEXT: v_mov_b32_e32 v14, s1 -; VI-NEXT: v_mov_b32_e32 v19, s2 -; VI-NEXT: v_mov_b32_e32 v13, s0 -; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12] -; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] -; VI-NEXT: flat_store_dwordx4 v[13:14], v[5:8] +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v20, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v21, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; VI-NEXT: v_cvt_f32_f16_sdwa v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v26, v6 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; VI-NEXT: v_cvt_f32_f16_sdwa v27, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v20 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v21 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; VI-NEXT: v_cvt_f32_f16_sdwa v28, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v29, v4 +; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; VI-NEXT: v_cvt_f32_f16_sdwa v30, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v24 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v25 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v22 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v23 +; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v31 +; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v32 +; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v29 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v30 +; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 +; VI-NEXT: v_mov_b32_e32 v21, s3 +; VI-NEXT: v_mov_b32_e32 v23, s1 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v26 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v27 +; VI-NEXT: v_mov_b32_e32 v20, s2 +; VI-NEXT: v_mov_b32_e32 v22, s0 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-NEXT: flat_store_dwordx4 v[18:19], v[4:7] +; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; VI-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; VI-NEXT: s_endpgm %val = load <16 x half>, ptr addrspace(1) %in %cvt = fpext <16 x half> %val to <16 x double> @@ -1706,51 +1706,52 @@ ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v13, s3 -; CI-NEXT: v_mov_b32_e32 v12, s2 ; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; CI-NEXT: v_mov_b32_e32 v12, s2 ; CI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_mov_b32_e32 v16, s2 ; CI-NEXT: s_waitcnt vmcnt(3) ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v2, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_or_b32_e32 v0, v0, v18 ; CI-NEXT: v_or_b32_e32 v3, v6, v2 -; CI-NEXT: v_or_b32_e32 v2, v17, v7 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: v_or_b32_e32 v2, v4, v5 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; CI-NEXT: s_nop 0 +; CI-NEXT: v_or_b32_e32 v1, v10, v4 +; CI-NEXT: v_or_b32_e32 v0, v8, v5 ; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_or_b32_e32 v1, v10, v6 -; CI-NEXT: v_or_b32_e32 v0, v8, v7 -; CI-NEXT: v_or_b32_e32 v3, v14, v9 -; CI-NEXT: v_or_b32_e32 v2, v12, v11 +; CI-NEXT: v_or_b32_e32 v3, v14, v6 +; CI-NEXT: v_or_b32_e32 v2, v12, v7 ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm @@ -1766,29 +1767,31 @@ ; VI-NEXT: s_add_u32 s4, s2, 48 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v9, s3 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: s_add_u32 s2, s2, 16 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: v_mov_b32_e32 v12, s2 ; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v17, s3 +; VI-NEXT: v_mov_b32_e32 v16, s2 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; VI-NEXT: v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_sdwa v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; VI-NEXT: v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v18, v4 +; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v10, v10 @@ -1799,19 +1802,17 @@ ; VI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: v_or_b32_e32 v0, v0, v18 ; VI-NEXT: v_or_b32_e32 v3, v6, v7 -; VI-NEXT: v_or_b32_e32 v2, v18, v17 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_or_b32_e32 v2, v4, v5 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_or_b32_e32 v1, v10, v11 ; VI-NEXT: v_or_b32_e32 v0, v8, v9 ; VI-NEXT: v_or_b32_e32 v3, v14, v15 ; VI-NEXT: v_or_b32_e32 v2, v12, v13 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %val = load <16 x float>, ptr addrspace(1) %in diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2931,8 +2931,7 @@ ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 -; GFX8-NEXT: v_mul_lo_u16_e32 v15, v16, v18 +; GFX8-NEXT: v_mul_lo_u16_e32 v20, v16, v18 ; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 @@ -2940,7 +2939,8 @@ ; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19 ; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11 -; GFX8-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 ; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -1067,585 +1067,560 @@ ; GCN-NEXT: s_lshr_b32 s42, s7, 22 ; GCN-NEXT: s_lshr_b32 s43, s7, 23 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x77 -; GCN-NEXT: v_mov_b32_e32 v14, s43 +; GCN-NEXT: v_mov_b32_e32 v15, s43 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x76 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc -; GCN-NEXT: v_mov_b32_e32 v17, s42 +; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc +; GCN-NEXT: v_mov_b32_e32 v18, s42 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x75 -; GCN-NEXT: v_or_b32_e32 v14, v14, v17 -; GCN-NEXT: v_mov_b32_e32 v17, s41 +; GCN-NEXT: v_or_b32_e32 v15, v15, v18 +; GCN-NEXT: v_mov_b32_e32 v18, s41 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x74 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_mov_b32_e32 v18, s40 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 +; GCN-NEXT: v_mov_b32_e32 v19, s40 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 +; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_and_b32_e32 v18, 3, v18 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x73 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_mov_b32_e32 v17, s39 +; GCN-NEXT: v_or_b32_e32 v15, v18, v15 +; GCN-NEXT: v_mov_b32_e32 v18, s39 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x72 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_mov_b32_e32 v18, s38 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 +; GCN-NEXT: v_mov_b32_e32 v19, s38 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x71 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_mov_b32_e32 v18, s37 +; GCN-NEXT: v_or_b32_e32 v18, v18, v19 +; GCN-NEXT: v_mov_b32_e32 v19, s37 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x70 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_mov_b32_e32 v19, s36 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_mov_b32_e32 v20, s36 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_or_b32_e32 v19, v20, v19 +; GCN-NEXT: v_and_b32_e32 v19, 3, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 4, v14 -; GCN-NEXT: v_and_b32_e32 v17, 15, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 +; GCN-NEXT: v_and_b32_e32 v18, 15, v18 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7f -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 7, s35 +; GCN-NEXT: v_or_b32_e32 v15, v18, v15 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7e -; GCN-NEXT: v_lshrrev_b16_e64 v18, 6, s35 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s35 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7d -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 5, s35 +; GCN-NEXT: v_or_b32_e32 v18, v18, v19 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7c -; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s35 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s35 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_or_b32_e32 v19, v20, v19 +; GCN-NEXT: v_and_b32_e32 v19, 3, v19 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7b -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 3, s35 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7a -; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s35 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s35 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x78 -; GCN-NEXT: v_mov_b32_e32 v12, s35 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 +; GCN-NEXT: v_mov_b32_e32 v13, s35 +; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x79 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s35 -; GCN-NEXT: v_cndmask_b32_e32 v12, 1, v12, vcc +; GCN-NEXT: v_or_b32_e32 v19, v19, v20 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s35 +; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v12, 1, v12 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v12, v12, v19 -; GCN-NEXT: v_and_b32_e32 v12, 3, v12 -; GCN-NEXT: v_or_b32_e32 v18, v12, v18 -; GCN-NEXT: v_mov_b32_e32 v12, 15 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 12, v17 -; GCN-NEXT: v_and_b32_sdwa v18, v18, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_and_b32_e32 v13, 1, v13 +; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 +; GCN-NEXT: v_or_b32_e32 v13, v13, v20 +; GCN-NEXT: v_and_b32_e32 v13, 3, v13 +; GCN-NEXT: v_or_b32_e32 v19, v13, v19 +; GCN-NEXT: v_mov_b32_e32 v13, 15 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 +; GCN-NEXT: v_and_b32_sdwa v19, v19, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6f -; GCN-NEXT: v_or_b32_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_lshrrev_b16_e64 v17, 15, s7 +; GCN-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_lshrrev_b16_e64 v18, 15, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6e -; GCN-NEXT: v_lshrrev_b16_e64 v18, 14, s7 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 14, s7 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6d -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 13, s7 +; GCN-NEXT: v_or_b32_e32 v18, v18, v19 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 13, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6c -; GCN-NEXT: v_lshrrev_b16_e64 v19, 12, s7 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 12, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_or_b32_e32 v19, v20, v19 +; GCN-NEXT: v_and_b32_e32 v19, 3, v19 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6b -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 11, s7 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6a -; GCN-NEXT: v_lshrrev_b16_e64 v19, 10, s7 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 10, s7 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x69 +; GCN-NEXT: v_or_b32_e32 v19, v19, v20 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 9, s7 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x68 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 8, s7 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 +; GCN-NEXT: v_or_b32_e32 v17, v17, v20 +; GCN-NEXT: v_and_b32_e32 v17, 3, v17 +; GCN-NEXT: v_or_b32_e32 v17, v17, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 +; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x67 +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s7 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x66 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s7 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x69 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x65 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 9, s7 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x68 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 8, s7 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x64 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_or_b32_e32 v19, v20, v19 +; GCN-NEXT: v_and_b32_e32 v19, 3, v19 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x63 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s7 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x62 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s7 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x61 +; GCN-NEXT: v_or_b32_e32 v19, v19, v20 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s7 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x60 +; GCN-NEXT: v_mov_b32_e32 v16, s7 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v19 +; GCN-NEXT: v_or_b32_e32 v16, v16, v20 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 +; GCN-NEXT: v_or_b32_e32 v16, v16, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 4, v18 +; GCN-NEXT: v_and_b32_e32 v16, 15, v16 ; GCN-NEXT: v_or_b32_e32 v16, v16, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 12, v17 -; GCN-NEXT: v_and_b32_sdwa v16, v16, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x67 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 7, s7 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x57 +; GCN-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_mov_b32_e32 v17, s34 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x66 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 6, s7 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x56 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_mov_b32_e32 v18, s33 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x65 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x55 ; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 5, s7 +; GCN-NEXT: v_mov_b32_e32 v18, s31 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x64 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s7 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x54 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_mov_b32_e32 v19, s30 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x63 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x53 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 3, s7 +; GCN-NEXT: v_mov_b32_e32 v18, s29 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x62 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s7 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x52 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_mov_b32_e32 v19, s28 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x61 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x51 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s7 +; GCN-NEXT: v_mov_b32_e32 v19, s27 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x60 -; GCN-NEXT: v_mov_b32_e32 v15, s7 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x50 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_mov_b32_e32 v20, s26 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v15, 1, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v19 -; GCN-NEXT: v_and_b32_e32 v15, 3, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v18 +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_or_b32_e32 v19, v20, v19 +; GCN-NEXT: v_and_b32_e32 v19, 3, v19 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 4, v17 -; GCN-NEXT: v_and_b32_e32 v15, 15, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v17 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x57 -; GCN-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v16, s34 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x56 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_mov_b32_e32 v17, s33 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x55 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_mov_b32_e32 v17, s31 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x54 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_mov_b32_e32 v18, s30 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_and_b32_e32 v18, 15, v18 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5f ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x53 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_mov_b32_e32 v17, s29 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x52 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_mov_b32_e32 v18, s28 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5e +; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s25 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x51 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_mov_b32_e32 v18, s27 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x50 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_mov_b32_e32 v19, s26 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 4, v16 -; GCN-NEXT: v_and_b32_e32 v17, 15, v17 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5f -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 7, s25 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5e -; GCN-NEXT: v_lshrrev_b16_e64 v18, 6, s25 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5d -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 5, s25 +; GCN-NEXT: v_or_b32_e32 v18, v18, v19 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5c -; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s25 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s25 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_or_b32_e32 v19, v20, v19 +; GCN-NEXT: v_and_b32_e32 v19, 3, v19 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5b -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 3, s25 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5a -; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s25 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s25 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x58 ; GCN-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x59 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s25 +; GCN-NEXT: v_or_b32_e32 v19, v19, v20 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s25 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_and_b32_e32 v3, 1, v3 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v3, v3, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 +; GCN-NEXT: v_or_b32_e32 v3, v3, v20 ; GCN-NEXT: v_and_b32_e32 v3, 3, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 12, v17 -; GCN-NEXT: v_and_b32_sdwa v3, v3, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v3, v17, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 +; GCN-NEXT: v_and_b32_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_or_b32_e32 v3, v18, v3 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4f -; GCN-NEXT: v_or_b32_sdwa v16, v16, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_or_b32_sdwa v17, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v3, 15, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4e -; GCN-NEXT: v_lshrrev_b16_e64 v17, 14, s6 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 14, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 +; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4d -; GCN-NEXT: v_or_b32_e32 v3, v3, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 13, s6 +; GCN-NEXT: v_or_b32_e32 v3, v3, v18 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 13, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4c -; GCN-NEXT: v_lshrrev_b16_e64 v18, 12, s6 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 12, s6 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 +; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_and_b32_e32 v18, 3, v18 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4b -; GCN-NEXT: v_or_b32_e32 v3, v17, v3 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 11, s6 +; GCN-NEXT: v_or_b32_e32 v3, v18, v3 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 11, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4a -; GCN-NEXT: v_lshrrev_b16_e64 v18, 10, s6 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 10, s6 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x49 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 9, s6 +; GCN-NEXT: v_or_b32_e32 v18, v18, v19 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 9, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x48 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 8, s6 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 8, s6 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_or_b32_e32 v19, v20, v19 +; GCN-NEXT: v_and_b32_e32 v19, 3, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 12, v3 -; GCN-NEXT: v_and_b32_sdwa v17, v17, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_and_b32_sdwa v18, v18, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x47 -; GCN-NEXT: v_or_b32_e32 v17, v3, v17 +; GCN-NEXT: v_or_b32_e32 v18, v3, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 7, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x46 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 6, s6 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x45 -; GCN-NEXT: v_or_b32_e32 v3, v3, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 5, s6 +; GCN-NEXT: v_or_b32_e32 v3, v3, v19 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x44 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s6 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s6 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_or_b32_e32 v19, v20, v19 +; GCN-NEXT: v_and_b32_e32 v19, 3, v19 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x43 -; GCN-NEXT: v_or_b32_e32 v18, v18, v3 +; GCN-NEXT: v_or_b32_e32 v19, v19, v3 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 3, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x42 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s6 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cmpk_lg_i32 s0, 0x41 -; GCN-NEXT: v_or_b32_e32 v3, v3, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s6 +; GCN-NEXT: v_or_b32_e32 v3, v3, v20 +; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 64 ; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 +; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v19 +; GCN-NEXT: v_or_b32_e32 v2, v2, v20 ; GCN-NEXT: v_and_b32_e32 v2, 3, v2 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_or_b32_sdwa v3, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_lshlrev_b16_e32 v14, 4, v18 +; GCN-NEXT: v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v19 ; GCN-NEXT: v_and_b32_e32 v2, 15, v2 ; GCN-NEXT: s_cmp_lg_u32 s0, 55 -; GCN-NEXT: v_or_b32_e32 v2, v2, v14 -; GCN-NEXT: v_mov_b32_e32 v14, s24 +; GCN-NEXT: v_or_b32_e32 v2, v2, v15 +; GCN-NEXT: v_mov_b32_e32 v15, s24 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 54 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc -; GCN-NEXT: v_mov_b32_e32 v15, s23 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_and_b32_e32 v15, 1, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 2, v15 -; GCN-NEXT: s_cmp_lg_u32 s0, 53 -; GCN-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_mov_b32_e32 v15, s22 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 52 -; GCN-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v16, s21 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v15, 1, v15 -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_and_b32_e32 v15, 3, v15 -; GCN-NEXT: s_cmp_lg_u32 s0, 51 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_mov_b32_e32 v15, s20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 50 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v16, s19 +; GCN-NEXT: v_mov_b32_e32 v16, s23 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 49 +; GCN-NEXT: s_cmp_lg_u32 s0, 53 +; GCN-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_mov_b32_e32 v16, s18 +; GCN-NEXT: v_mov_b32_e32 v16, s22 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 48 +; GCN-NEXT: s_cmp_lg_u32 s0, 52 +; GCN-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_mov_b32_e32 v17, s17 +; GCN-NEXT: v_mov_b32_e32 v17, s21 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 +; GCN-NEXT: s_cmp_lg_u32 s0, 51 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 4, v14 -; GCN-NEXT: v_and_b32_e32 v15, 15, v15 -; GCN-NEXT: s_cmp_lg_u32 s0, 63 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_lshrrev_b16_e64 v15, 7, s16 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 62 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 6, s16 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc +; GCN-NEXT: v_mov_b32_e32 v16, s20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 50 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 61 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 5, s16 +; GCN-NEXT: v_mov_b32_e32 v17, s19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 60 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 4, s16 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 +; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: s_cmp_lg_u32 s0, 49 +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: v_mov_b32_e32 v17, s18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 48 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 +; GCN-NEXT: v_mov_b32_e32 v18, s17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 +; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: v_and_b32_e32 v17, 3, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 59 +; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 +; GCN-NEXT: v_and_b32_e32 v16, 15, v16 +; GCN-NEXT: s_cmp_lg_u32 s0, 63 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 3, s16 +; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 58 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 2, s16 +; GCN-NEXT: s_cmp_lg_u32 s0, 62 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s16 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 56 -; GCN-NEXT: v_mov_b32_e32 v13, s16 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 57 +; GCN-NEXT: s_cmp_lg_u32 s0, 61 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 1, s16 -; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 60 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v13, 1, v13 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_or_b32_e32 v13, v13, v17 -; GCN-NEXT: v_and_b32_e32 v13, 3, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 12, v15 -; GCN-NEXT: v_and_b32_sdwa v13, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v13, v15, v13 -; GCN-NEXT: s_cmp_lg_u32 s0, 47 -; GCN-NEXT: v_or_b32_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_lshrrev_b16_e64 v13, 15, s5 +; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: v_and_b32_e32 v17, 3, v17 +; GCN-NEXT: s_cmp_lg_u32 s0, 59 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 46 -; GCN-NEXT: v_lshrrev_b16_e64 v15, 14, s5 -; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc +; GCN-NEXT: s_cmp_lg_u32 s0, 58 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s16 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_and_b32_e32 v15, 1, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v13, 3, v13 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 2, v15 -; GCN-NEXT: s_cmp_lg_u32 s0, 45 -; GCN-NEXT: v_or_b32_e32 v13, v13, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v15, 13, s5 +; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: s_cmp_lg_u32 s0, 56 +; GCN-NEXT: v_mov_b32_e32 v14, s16 +; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 44 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 12, s5 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc +; GCN-NEXT: s_cmp_lg_u32 s0, 57 +; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 1, s16 +; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v15, 1, v15 -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_and_b32_e32 v15, 3, v15 -; GCN-NEXT: s_cmp_lg_u32 s0, 43 -; GCN-NEXT: v_or_b32_e32 v13, v15, v13 -; GCN-NEXT: v_lshrrev_b16_e64 v15, 11, s5 +; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_and_b32_e32 v14, 1, v14 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 +; GCN-NEXT: v_or_b32_e32 v14, v14, v18 +; GCN-NEXT: v_and_b32_e32 v14, 3, v14 +; GCN-NEXT: v_or_b32_e32 v14, v14, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 +; GCN-NEXT: v_and_b32_sdwa v14, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_or_b32_e32 v14, v16, v14 +; GCN-NEXT: s_cmp_lg_u32 s0, 47 +; GCN-NEXT: v_or_b32_sdwa v15, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_lshrrev_b16_e64 v14, 15, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 42 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 10, s5 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc +; GCN-NEXT: s_cmp_lg_u32 s0, 46 +; GCN-NEXT: v_lshrrev_b16_e64 v16, 14, s5 +; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 +; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 41 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 9, s5 +; GCN-NEXT: s_cmp_lg_u32 s0, 45 +; GCN-NEXT: v_or_b32_e32 v14, v14, v16 +; GCN-NEXT: v_lshrrev_b16_e64 v16, 13, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 40 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 8, s5 +; GCN-NEXT: s_cmp_lg_u32 s0, 44 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 12, s5 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc @@ -1653,211 +1628,236 @@ ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v13, 12, v13 -; GCN-NEXT: v_and_b32_sdwa v15, v15, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: s_cmp_lg_u32 s0, 43 +; GCN-NEXT: v_or_b32_e32 v14, v16, v14 +; GCN-NEXT: v_lshrrev_b16_e64 v16, 11, s5 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 42 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 10, s5 +; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 +; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: s_cmp_lg_u32 s0, 41 +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 9, s5 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 40 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 8, s5 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 +; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: v_and_b32_e32 v17, 3, v17 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: v_lshlrev_b16_e32 v14, 12, v14 +; GCN-NEXT: v_and_b32_sdwa v16, v16, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: s_cmp_lg_u32 s0, 39 -; GCN-NEXT: v_or_b32_e32 v15, v13, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v13, 7, s5 +; GCN-NEXT: v_or_b32_e32 v16, v14, v16 +; GCN-NEXT: v_lshrrev_b16_e64 v14, 7, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 38 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 6, s5 -; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s5 +; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v13, 3, v13 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 +; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 ; GCN-NEXT: s_cmp_lg_u32 s0, 37 -; GCN-NEXT: v_or_b32_e32 v13, v13, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 5, s5 +; GCN-NEXT: v_or_b32_e32 v14, v14, v17 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 36 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 4, s5 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s5 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v16, 3, v16 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 +; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: v_and_b32_e32 v17, 3, v17 ; GCN-NEXT: s_cmp_lg_u32 s0, 35 -; GCN-NEXT: v_or_b32_e32 v16, v16, v13 -; GCN-NEXT: v_lshrrev_b16_e64 v13, 3, s5 +; GCN-NEXT: v_or_b32_e32 v17, v17, v14 +; GCN-NEXT: v_lshrrev_b16_e64 v14, 3, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 34 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 2, s5 -; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s5 +; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v13, 3, v13 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 +; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 ; GCN-NEXT: s_cmp_lg_u32 s0, 33 -; GCN-NEXT: v_or_b32_e32 v17, v13, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v13, 1, s5 +; GCN-NEXT: v_or_b32_e32 v18, v14, v18 +; GCN-NEXT: v_lshrrev_b16_e64 v14, 1, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 32 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc +; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v13, 1, v13 +; GCN-NEXT: v_lshlrev_b16_e32 v14, 1, v14 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v13 +; GCN-NEXT: v_or_b32_e32 v1, v1, v14 ; GCN-NEXT: v_and_b32_e32 v1, 3, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 4, v16 +; GCN-NEXT: v_or_b32_e32 v1, v1, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v17, 4, v17 ; GCN-NEXT: v_and_b32_e32 v1, 15, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v16 -; GCN-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_or_b32_e32 v1, v1, v17 +; GCN-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: s_cmp_lg_u32 s0, 23 -; GCN-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v14, s15 +; GCN-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GCN-NEXT: v_mov_b32_e32 v15, s15 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 22 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc -; GCN-NEXT: v_mov_b32_e32 v15, s14 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_and_b32_e32 v15, 1, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 2, v15 -; GCN-NEXT: s_cmp_lg_u32 s0, 21 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_mov_b32_e32 v15, s13 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 20 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v16, s12 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v15, 1, v15 -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_and_b32_e32 v15, 3, v15 -; GCN-NEXT: s_cmp_lg_u32 s0, 19 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_mov_b32_e32 v15, s11 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 18 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v16, s10 +; GCN-NEXT: v_mov_b32_e32 v16, s14 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 17 +; GCN-NEXT: s_cmp_lg_u32 s0, 21 ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_mov_b32_e32 v16, s9 +; GCN-NEXT: v_mov_b32_e32 v16, s13 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 16 +; GCN-NEXT: s_cmp_lg_u32 s0, 20 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_mov_b32_e32 v18, s8 +; GCN-NEXT: v_mov_b32_e32 v17, s12 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v16, v18, v16 +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 +; GCN-NEXT: s_cmp_lg_u32 s0, 19 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 4, v14 -; GCN-NEXT: v_and_b32_e32 v15, 15, v15 +; GCN-NEXT: v_mov_b32_e32 v16, s11 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 18 +; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc +; GCN-NEXT: v_mov_b32_e32 v17, s10 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 +; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 +; GCN-NEXT: s_cmp_lg_u32 s0, 17 +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: v_mov_b32_e32 v17, s9 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 16 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_mov_b32_e32 v19, s8 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 +; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_or_b32_e32 v17, v19, v17 +; GCN-NEXT: v_and_b32_e32 v17, 3, v17 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 +; GCN-NEXT: v_and_b32_e32 v16, 15, v16 ; GCN-NEXT: s_cmp_lg_u32 s0, 31 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_lshrrev_b16_e64 v15, 7, s1 +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 30 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 6, s1 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s1 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 +; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 ; GCN-NEXT: s_cmp_lg_u32 s0, 29 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 5, s1 +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 28 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s1 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s1 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v16, v18, v16 -; GCN-NEXT: v_and_b32_e32 v16, 3, v16 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 +; GCN-NEXT: v_and_b32_e32 v19, 1, v19 +; GCN-NEXT: v_or_b32_e32 v17, v19, v17 +; GCN-NEXT: v_and_b32_e32 v17, 3, v17 ; GCN-NEXT: s_cmp_lg_u32 s0, 27 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 3, s1 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 26 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s1 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s1 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: s_cmp_lg_u32 s0, 24 -; GCN-NEXT: v_mov_b32_e32 v17, s1 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 +; GCN-NEXT: v_mov_b32_e32 v18, s1 +; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 25 -; GCN-NEXT: v_or_b32_e32 v16, v16, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 1, s1 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_or_b32_e32 v17, v17, v19 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s1 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 12, v15 -; GCN-NEXT: v_and_b32_sdwa v16, v16, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc +; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 +; GCN-NEXT: v_or_b32_e32 v18, v18, v19 +; GCN-NEXT: v_and_b32_e32 v18, 3, v18 +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 +; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: s_cmp_lg_u32 s0, 15 -; GCN-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_lshrrev_b16_e64 v15, 15, s4 +; GCN-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_lshrrev_b16_e64 v16, 15, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 14 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 14, s4 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 14, s4 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 +; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 ; GCN-NEXT: s_cmp_lg_u32 s0, 13 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 13, s4 +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 13, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 12 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 12, s4 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshrrev_b16_e64 v18, 12, s4 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc +; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 +; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: s_cmp_lg_u32 s0, 11 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 11, s4 -; GCN-NEXT: v_and_b32_e32 v16, 3, v16 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s4 +; GCN-NEXT: v_and_b32_e32 v17, 3, v17 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 10 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 10, s4 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v17, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v14, 10, s4 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 9 -; GCN-NEXT: v_lshrrev_b16_e64 v13, 9, s4 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v18, vcc +; GCN-NEXT: v_lshrrev_b16_e64 v12, 9, s4 +; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 8 ; GCN-NEXT: v_lshrrev_b16_e64 v11, 8, s4 -; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc +; GCN-NEXT: v_cndmask_b32_e32 v12, 1, v12, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 7 ; GCN-NEXT: v_lshrrev_b16_e64 v10, 7, s4 @@ -1892,8 +1892,8 @@ ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v13, 1, v13 +; GCN-NEXT: v_and_b32_e32 v14, 1, v14 +; GCN-NEXT: v_lshlrev_b16_e32 v12, 1, v12 ; GCN-NEXT: v_and_b32_e32 v11, 1, v11 ; GCN-NEXT: v_and_b32_e32 v9, 1, v9 ; GCN-NEXT: v_lshlrev_b16_e32 v8, 1, v8 @@ -1901,33 +1901,33 @@ ; GCN-NEXT: v_and_b32_e32 v5, 1, v5 ; GCN-NEXT: v_lshlrev_b16_e32 v4, 1, v4 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: v_or_b32_e32 v11, v11, v13 +; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 +; GCN-NEXT: v_lshlrev_b16_e32 v14, 2, v14 +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 ; GCN-NEXT: v_lshlrev_b16_e32 v10, 3, v10 ; GCN-NEXT: v_lshlrev_b16_e32 v9, 2, v9 ; GCN-NEXT: v_or_b32_e32 v7, v7, v8 ; GCN-NEXT: v_lshlrev_b16_e32 v6, 3, v6 ; GCN-NEXT: v_lshlrev_b16_e32 v5, 2, v5 ; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: v_or_b32_e32 v14, v17, v14 ; GCN-NEXT: v_and_b32_e32 v11, 3, v11 ; GCN-NEXT: v_or_b32_e32 v9, v10, v9 ; GCN-NEXT: v_and_b32_e32 v7, 3, v7 ; GCN-NEXT: v_or_b32_e32 v5, v6, v5 ; GCN-NEXT: v_and_b32_e32 v0, 3, v0 -; GCN-NEXT: v_or_b32_e32 v11, v11, v16 +; GCN-NEXT: v_or_b32_e32 v11, v11, v14 ; GCN-NEXT: v_or_b32_e32 v7, v7, v9 ; GCN-NEXT: v_or_b32_e32 v0, v0, v5 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 12, v15 -; GCN-NEXT: v_and_b32_sdwa v11, v11, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 +; GCN-NEXT: v_and_b32_sdwa v11, v11, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_lshlrev_b16_e32 v7, 4, v7 ; GCN-NEXT: v_and_b32_e32 v0, 15, v0 -; GCN-NEXT: v_or_b32_e32 v11, v15, v11 +; GCN-NEXT: v_or_b32_e32 v11, v16, v11 ; GCN-NEXT: v_or_b32_e32 v0, v0, v7 ; GCN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GCN-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2801,14 +2801,16 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc +; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 14 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc +; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 15 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2816,32 +2818,30 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 12 -; GFX9-NEXT: v_perm_b32 v0, v12, v0, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc +; GFX9-NEXT: v_perm_b32 v0, v10, v0, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 10 -; GFX9-NEXT: v_perm_b32 v7, v12, v7, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc +; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 11 -; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 8 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc +; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 9 -; GFX9-NEXT: v_perm_b32 v1, v11, v1, s2 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; GFX9-NEXT: v_perm_b32 v6, v12, v6, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc ; GFX9-NEXT: v_perm_b32 v5, v10, v5, s2 ; GFX9-NEXT: v_perm_b32 v4, v9, v4, s2 ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 @@ -2851,7 +2851,7 @@ ; VI-LABEL: v_insertelement_v16f16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2865,81 +2865,81 @@ ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cmp_eq_u32 s7, 14 +; VI-NEXT: s_cmp_eq_u32 s5, 14 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc -; VI-NEXT: v_mov_b32_e32 v12, s6 +; VI-NEXT: v_mov_b32_e32 v12, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 15 +; VI-NEXT: s_cmp_eq_u32 s5, 15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 12 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 12 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 13 +; VI-NEXT: s_cmp_eq_u32 s5, 13 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 10 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 11 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 10 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 11 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 8 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] -; VI-NEXT: s_cmp_eq_u32 s7, 9 +; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 8 +; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 9 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 6 -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s5, 6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v13, v16, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 7 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] +; VI-NEXT: s_cmp_eq_u32 s5, 7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 4 -; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s5, 4 +; VI-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 5 -; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; VI-NEXT: s_cmp_eq_u32 s5, 5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 2 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 3 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s5, 2 +; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v13, v18, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 3 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 1 +; VI-NEXT: s_cmp_eq_u32 s5, 0 ; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v13, v19, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 1 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -2972,101 +2972,101 @@ ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; CI-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 11 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; CI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc -; CI-NEXT: v_cndmask_b32_e64 v12, v12, v10, s[2:3] ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 10 -; CI-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; CI-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; CI-NEXT: v_or_b32_e32 v2, v2, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; CI-NEXT: v_or_b32_e32 v1, v1, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 ; CI-NEXT: s_cmp_eq_u32 s5, 9 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 8 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; CI-NEXT: v_cndmask_b32_e32 v12, v12, v10, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cndmask_b32_e32 v14, v14, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 7 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 6 -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cndmask_b32_e32 v15, v15, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 5 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; CI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 4 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cndmask_b32_e32 v14, v14, v10, vcc +; CI-NEXT: v_or_b32_e32 v3, v3, v11 +; CI-NEXT: v_cndmask_b32_e32 v11, v16, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; CI-NEXT: v_cndmask_b32_e64 v12, v12, v10, s[2:3] ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_or_b32_e32 v6, v6, v11 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_or_b32_e32 v3, v3, v11 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; CI-NEXT: v_or_b32_e32 v0, v0, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; CI-NEXT: s_cmp_eq_u32 s5, 3 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_or_b32_e32 v7, v7, v12 +; CI-NEXT: v_or_b32_e32 v2, v2, v12 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_or_b32_e32 v6, v6, v12 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; CI-NEXT: s_cmp_eq_u32 s5, 3 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc +; CI-NEXT: v_or_b32_e32 v0, v0, v12 +; CI-NEXT: v_cndmask_b32_e32 v12, v17, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 1 ; CI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v12, v12, v10, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; CI-NEXT: v_or_b32_e32 v5, v5, v10 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; CI-NEXT: v_or_b32_e32 v1, v1, v13 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; CI-NEXT: v_or_b32_e32 v5, v5, v10 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; CI-NEXT: v_or_b32_e32 v7, v7, v13 ; CI-NEXT: v_or_b32_e32 v4, v4, v10 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; CI-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir --- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir @@ -36,6 +36,14 @@ ; GCN-NEXT: [[V_CVT_F64_I32_e32_2:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY2]], implicit $mode, implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_3:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY3]], implicit $mode, implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_4:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY4]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_5:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY5]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_6:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY6]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_7:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY7]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_8:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY8]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_9:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY9]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_10:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_11:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_12:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) @@ -47,22 +55,14 @@ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_2]], implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_3]], implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_4]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_5:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY5]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_5]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_6:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY6]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_6]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_7:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY7]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_7]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_8:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY8]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_8]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_9:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY9]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_9]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_10:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_10]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_11:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_11]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_12:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_12]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_5]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_6]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_7]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_8]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_9]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_10]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_11]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_12]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_13]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -493,51 +493,51 @@ ; SI-NEXT: s_bfe_u32 s7, s19, 0xb0014 ; SI-NEXT: s_addk_i32 s7, 0xfc01 ; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s7 +; SI-NEXT: v_mov_b32_e32 v13, s5 ; SI-NEXT: s_andn2_b64 s[8:9], s[18:19], s[8:9] ; SI-NEXT: s_and_b32 s10, s19, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v12, s4 ; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: v_add_f64 v[12:13], s[12:13], -v[12:13] ; SI-NEXT: s_cselect_b32 s8, 0, s8 ; SI-NEXT: s_cselect_b32 s9, s10, s9 ; SI-NEXT: s_cmp_gt_i32 s7, 51 ; SI-NEXT: s_cselect_b32 s9, s19, s9 -; SI-NEXT: s_cselect_b32 s8, s18, s8 -; SI-NEXT: s_bfe_u32 s7, s17, 0xb0014 -; SI-NEXT: v_mov_b32_e32 v13, s5 -; SI-NEXT: s_addk_i32 s7, 0xfc01 -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], s7 -; SI-NEXT: v_add_f64 v[12:13], s[12:13], -v[12:13] -; SI-NEXT: s_andn2_b64 s[10:11], s[16:17], s[10:11] -; SI-NEXT: s_and_b32 s12, s17, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s7, 0 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[12:13]|, 0.5 +; SI-NEXT: s_cselect_b32 s8, s18, s8 ; SI-NEXT: v_mov_b32_e32 v13, s9 -; SI-NEXT: s_cselect_b32 s10, 0, s10 -; SI-NEXT: s_cselect_b32 s11, s12, s11 -; SI-NEXT: s_cmp_gt_i32 s7, 51 ; SI-NEXT: v_mov_b32_e32 v12, s8 -; SI-NEXT: s_cselect_b32 s11, s17, s11 ; SI-NEXT: v_mov_b32_e32 v9, s13 ; SI-NEXT: v_add_f64 v[12:13], s[18:19], -v[12:13] -; SI-NEXT: s_cselect_b32 s10, s16, s10 -; SI-NEXT: v_mov_b32_e32 v15, s11 ; SI-NEXT: v_bfi_b32 v9, s6, v8, v9 -; SI-NEXT: v_mov_b32_e32 v14, s10 ; SI-NEXT: v_cndmask_b32_e32 v17, 0, v9, vcc ; SI-NEXT: v_mov_b32_e32 v9, s19 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[12:13]|, 0.5 -; SI-NEXT: v_add_f64 v[14:15], s[16:17], -v[14:15] ; SI-NEXT: v_bfi_b32 v9, s6, v8, v9 +; SI-NEXT: s_bfe_u32 s7, s17, 0xb0014 ; SI-NEXT: v_cndmask_b32_e32 v13, 0, v9, vcc +; SI-NEXT: v_mov_b32_e32 v12, 0 +; SI-NEXT: s_addk_i32 s7, 0xfc01 +; SI-NEXT: v_add_f64 v[14:15], s[8:9], v[12:13] +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s7 +; SI-NEXT: s_andn2_b64 s[8:9], s[16:17], s[8:9] +; SI-NEXT: s_and_b32 s10, s17, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cselect_b32 s8, 0, s8 +; SI-NEXT: s_cselect_b32 s9, s10, s9 +; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cselect_b32 s9, s17, s9 +; SI-NEXT: s_cselect_b32 s8, s16, s8 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_add_f64 v[12:13], s[16:17], -v[12:13] ; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[12:13]|, 0.5 ; SI-NEXT: v_bfi_b32 v8, s6, v8, v9 -; SI-NEXT: v_mov_b32_e32 v12, 0 ; SI-NEXT: v_cndmask_b32_e32 v9, 0, v8, vcc ; SI-NEXT: v_mov_b32_e32 v8, 0 ; SI-NEXT: v_mov_b32_e32 v16, 0 -; SI-NEXT: v_add_f64 v[14:15], s[8:9], v[12:13] -; SI-NEXT: v_add_f64 v[12:13], s[10:11], v[8:9] +; SI-NEXT: v_add_f64 v[12:13], s[8:9], v[8:9] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_add_f64 v[8:9], s[4:5], v[16:17] ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 @@ -549,15 +549,15 @@ ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; CI-NEXT: s_brev_b32 s2, -2 -; CI-NEXT: v_mov_b32_e32 v16, 0x3ff00000 -; CI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s23, 0xf000 +; CI-NEXT: s_brev_b32 s20, -2 +; CI-NEXT: v_mov_b32_e32 v20, 0x3ff00000 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v4, s7 ; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] -; CI-NEXT: v_bfi_b32 v4, s2, v16, v4 +; CI-NEXT: v_bfi_b32 v4, s20, v20, v4 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc @@ -566,7 +566,7 @@ ; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[4:5] ; CI-NEXT: v_mov_b32_e32 v6, s5 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 -; CI-NEXT: v_bfi_b32 v6, s2, v16, v6 +; CI-NEXT: v_bfi_b32 v6, s20, v20, v6 ; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[10:11] ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -574,7 +574,7 @@ ; CI-NEXT: v_add_f64 v[4:5], s[10:11], -v[6:7] ; CI-NEXT: v_mov_b32_e32 v8, s11 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v8, s2, v16, v8 +; CI-NEXT: v_bfi_b32 v8, s20, v20, v8 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[8:9] ; CI-NEXT: v_mov_b32_e32 v4, 0 @@ -582,47 +582,47 @@ ; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[8:9] ; CI-NEXT: v_mov_b32_e32 v10, s9 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v10, s2, v16, v10 +; CI-NEXT: v_bfi_b32 v10, s20, v20, v10 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v10, vcc +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[14:15] ; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: v_add_f64 v[4:5], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v8, s15 -; CI-NEXT: v_bfi_b32 v18, s2, v16, v8 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[16:17] -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19] -; CI-NEXT: v_add_f64 v[14:15], s[16:17], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v19, s19 +; CI-NEXT: v_add_f64 v[8:9], s[14:15], -v[10:11] +; CI-NEXT: v_mov_b32_e32 v12, s15 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5 +; CI-NEXT: v_bfi_b32 v12, s20, v20, v12 +; CI-NEXT: v_trunc_f64_e32 v[16:17], s[12:13] +; CI-NEXT: v_cndmask_b32_e32 v9, 0, v12, vcc +; CI-NEXT: v_mov_b32_e32 v8, 0 +; CI-NEXT: v_add_f64 v[10:11], v[10:11], v[8:9] +; CI-NEXT: v_add_f64 v[8:9], s[12:13], -v[16:17] +; CI-NEXT: v_mov_b32_e32 v12, s13 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5 +; CI-NEXT: v_bfi_b32 v12, s20, v20, v12 +; CI-NEXT: v_cndmask_b32_e32 v9, 0, v12, vcc +; CI-NEXT: v_trunc_f64_e32 v[12:13], s[18:19] +; CI-NEXT: v_mov_b32_e32 v18, s19 +; CI-NEXT: v_add_f64 v[14:15], s[18:19], -v[12:13] +; CI-NEXT: v_bfi_b32 v18, s20, v20, v18 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; CI-NEXT: v_add_f64 v[14:15], s[18:19], -v[10:11] -; CI-NEXT: v_mov_b32_e32 v17, s17 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 -; CI-NEXT: v_bfi_b32 v19, s2, v16, v19 -; CI-NEXT: v_trunc_f64_e32 v[12:13], s[12:13] -; CI-NEXT: v_bfi_b32 v17, s2, v16, v17 -; CI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1] ; CI-NEXT: v_mov_b32_e32 v14, 0 -; CI-NEXT: v_add_f64 v[10:11], v[10:11], v[14:15] -; CI-NEXT: v_cndmask_b32_e32 v15, 0, v17, vcc -; CI-NEXT: v_mov_b32_e32 v14, 0 -; CI-NEXT: v_mov_b32_e32 v17, s13 -; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[14:15] -; CI-NEXT: v_add_f64 v[14:15], s[12:13], -v[12:13] -; CI-NEXT: v_bfi_b32 v19, s2, v16, v17 -; CI-NEXT: v_trunc_f64_e32 v[16:17], s[14:15] -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; CI-NEXT: v_add_f64 v[14:15], s[14:15], -v[16:17] -; CI-NEXT: s_mov_b32 s22, -1 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v14, 0 -; CI-NEXT: v_cndmask_b32_e64 v15, 0, v18, s[0:1] -; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[14:15] -; CI-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc -; CI-NEXT: v_mov_b32_e32 v16, 0 -; CI-NEXT: v_add_f64 v[12:13], v[12:13], v[16:17] -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; CI-NEXT: v_cndmask_b32_e32 v15, 0, v18, vcc +; CI-NEXT: v_trunc_f64_e32 v[18:19], s[16:17] +; CI-NEXT: v_add_f64 v[14:15], v[12:13], v[14:15] +; CI-NEXT: v_add_f64 v[12:13], s[16:17], -v[18:19] +; CI-NEXT: v_mov_b32_e32 v21, s17 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[12:13]|, 0.5 +; CI-NEXT: v_bfi_b32 v20, s20, v20, v21 +; CI-NEXT: v_cndmask_b32_e32 v13, 0, v20, vcc +; CI-NEXT: v_mov_b32_e32 v12, 0 +; CI-NEXT: v_mov_b32_e32 v8, 0 +; CI-NEXT: v_add_f64 v[12:13], v[18:19], v[12:13] +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_add_f64 v[8:9], v[16:17], v[8:9] +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 store <8 x double> %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -2074,27 +2074,29 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s14, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s3, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff @@ -2104,60 +2106,56 @@ ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: @@ -2196,32 +2194,32 @@ ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-HSA-NEXT: s_and_b32 s0, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s1, s14, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s31 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s30 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 @@ -2460,90 +2458,88 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s1, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s19, s0, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s20, s1 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s21, s0 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s1, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s0, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s1, s1 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s0, s0 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s22, s3, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s2, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s24, s3 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s25, s2 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s4, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s3, s3 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s2, s2 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s4, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s6, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s6, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s8, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s8, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s10, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s10, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s36, s12, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s12, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s15, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s14, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s15, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s36, s14, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: @@ -2554,6 +2550,8 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_ashr_i32 s18, s1, 16 ; GCN-HSA-NEXT: s_ashr_i32 s19, s0, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s21, s0 ; GCN-HSA-NEXT: s_ashr_i32 s22, s3, 16 ; GCN-HSA-NEXT: s_ashr_i32 s23, s2, 16 ; GCN-HSA-NEXT: s_ashr_i32 s24, s5, 16 @@ -2566,36 +2564,34 @@ ; GCN-HSA-NEXT: s_ashr_i32 s31, s10, 16 ; GCN-HSA-NEXT: s_ashr_i32 s33, s13, 16 ; GCN-HSA-NEXT: s_ashr_i32 s34, s12, 16 -; GCN-HSA-NEXT: s_ashr_i32 s35, s15, 16 -; GCN-HSA-NEXT: s_ashr_i32 s36, s14, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s21, s0 +; GCN-HSA-NEXT: s_ashr_i32 s0, s15, 16 +; GCN-HSA-NEXT: s_ashr_i32 s1, s14, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 ; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 ; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 @@ -2910,18 +2906,18 @@ ; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s21, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s20, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s23, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s25, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s24, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s27, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s29, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s28, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s31, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s30, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s21, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -2941,21 +2937,22 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 @@ -3093,96 +3090,96 @@ ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s61 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s59 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s58 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s42 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s56 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s54 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s53 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 @@ -3237,121 +3234,113 @@ ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s39, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s36, s16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s37, s17 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[18:19], 0x40 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s69, s31, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s30, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s15, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s67, s31, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s30, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s0, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s60, s1, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s61, s0, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s62, s3, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s63, s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s67, s29, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s28, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s29, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s28, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s36 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s27, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s26, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s27, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s26, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s27, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s25, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s25, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s24, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s24, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s25, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s23, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s23, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s22, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s22, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s23, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s22, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s21, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s21, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s20, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s20, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s21, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s19, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:192 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s19, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s18, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s18, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s17, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s17, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s16, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s16, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s15, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s14, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s13, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:144 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s13, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s12, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s12, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:128 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s11, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s10, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s9, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s8, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:96 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s7, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 @@ -3360,7 +3349,7 @@ ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 @@ -3369,31 +3358,37 @@ ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s3, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s1, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s0, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s61 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v64i16_to_v64i32: @@ -3636,17 +3631,16 @@ ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s20, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s21, s21 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s20, s20 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s22, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s62, s23 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s23, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s22, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s23, s23 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s22, s22 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s25, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s24, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s25, s25 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s24, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s64, s25 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s24, s24 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s27, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s26, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s27, s27 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s26, s26 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s29, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s28, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s29, s29 @@ -3655,7 +3649,8 @@ ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s70, s30, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s31, s31 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s30, s30 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s23, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s26, s26 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s25, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -3673,23 +3668,24 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s63 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 @@ -3813,110 +3809,110 @@ ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 ; GCN-HSA-NEXT: s_sext_i32_i16 s53, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 -; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] -; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 +; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 +; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 +; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 +; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 +; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s50 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s49 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 @@ -3971,161 +3967,161 @@ ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s39, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s36, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s37, s1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x40 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s49, s31, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s69, s15, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s14, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s51, s31, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s52, s30, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s31, s31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s15, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s68, s14, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s51, s1, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s52, s0, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s53, s1 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s54, s0 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s55, s3, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s56, s2, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s57, s3 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s58, s2 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s13, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s68, s12, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s13, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s66, s12, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s36 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s11, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s66, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s11, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s64, s10, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s9, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s61, s9, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s64, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s62, s8, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s61, s7, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s59, s7, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s62, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s60, s6, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s59, s5, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s57, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s60, s4, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s58, s4, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s50, s30, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:192 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s55, s3, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s31, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s56, s2, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s53, s1, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s54, s0, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s1 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s0, s0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s30, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s47, s29, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s48, s28, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:144 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s49, s29, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s50, s28, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s29, s29 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s28 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s45, s27, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:128 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s47, s27, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s46, s26, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s48, s26, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s27, s27 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s26 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s43, s25, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s45, s25, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s44, s24, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s46, s24, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s25, s25 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s24 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s41, s23, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s43, s23, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s42, s22, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s44, s22, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s23, s23 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s22 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s21, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s41, s21, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s40, s20, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s42, s20, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s21, s21 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:64 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s35, s19, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s18, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s40, s18, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s19, s19 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s18 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s17, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s34, s16, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s17, s17 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v64i16_to_v64i32: @@ -5777,55 +5773,57 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s6, s15 -; GCN-HSA-NEXT: s_mov_b32 s16, s13 -; GCN-HSA-NEXT: s_mov_b32 s18, s11 -; GCN-HSA-NEXT: s_mov_b32 s20, s9 -; GCN-HSA-NEXT: s_lshr_b32 s22, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[14:15], s[14:15], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-HSA-NEXT: s_mov_b32 s16, s15 +; GCN-HSA-NEXT: s_mov_b32 s18, s13 +; GCN-HSA-NEXT: s_mov_b32 s20, s11 +; GCN-HSA-NEXT: s_mov_b32 s22, s9 +; GCN-HSA-NEXT: s_lshr_b32 s24, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[10:11], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[26:27], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[14:15], s[14:15], 48 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 ; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: s_add_u32 s10, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -5834,8 +5832,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -5843,8 +5841,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -5852,25 +5850,25 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -6053,108 +6051,106 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s0, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: @@ -6196,43 +6192,43 @@ ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s0, s15, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -6579,95 +6575,97 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s3 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s38, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s8, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s0, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s0, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[22:23], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[6:7], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[8:9], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[12:13], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[14:15], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[12:13], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[14:15], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[6:7], 48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s55 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s56 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[46:47], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[38:39], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s40 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[54:55], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[52:53], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s72 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s73 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s68 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 @@ -6680,39 +6678,41 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s63 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s59 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s31 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: @@ -6722,21 +6722,21 @@ ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s42, s15 -; GCN-HSA-NEXT: s_mov_b32 s48, s13 -; GCN-HSA-NEXT: s_mov_b32 s50, s11 -; GCN-HSA-NEXT: s_mov_b32 s52, s9 -; GCN-HSA-NEXT: s_mov_b32 s54, s7 -; GCN-HSA-NEXT: s_mov_b32 s56, s5 -; GCN-HSA-NEXT: s_mov_b32 s44, s3 -; GCN-HSA-NEXT: s_mov_b32 s58, s1 -; GCN-HSA-NEXT: s_lshr_b32 s60, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s62, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s64, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s66, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s68, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s70, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s72, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s74, s0, 16 +; GCN-HSA-NEXT: s_mov_b32 s44, s13 +; GCN-HSA-NEXT: s_mov_b32 s46, s11 +; GCN-HSA-NEXT: s_mov_b32 s48, s9 +; GCN-HSA-NEXT: s_mov_b32 s50, s7 +; GCN-HSA-NEXT: s_mov_b32 s52, s5 +; GCN-HSA-NEXT: s_mov_b32 s54, s3 +; GCN-HSA-NEXT: s_mov_b32 s56, s1 +; GCN-HSA-NEXT: s_lshr_b32 s58, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s60, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s68, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s70, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s72, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 @@ -6750,7 +6750,7 @@ ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[46:47], s[6:7], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 @@ -6758,107 +6758,109 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[74:75], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[72:73], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[70:71], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[68:69], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[66:67], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[72:73], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s58, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s59, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 -; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s49 -; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49 -; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s46 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s59 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s47 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 +; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s44 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 -; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 -; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s41 -; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: s_add_u32 s38, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39 -; GCN-HSA-NEXT: s_add_u32 s38, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 -; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 +; GCN-HSA-NEXT: s_add_u32 s36, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 -; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s74 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s75 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s44 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38 +; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s37 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 ; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s38 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 @@ -6922,141 +6924,139 @@ ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s36, s15 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, s13 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[82:83], s[14:15], 48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s11 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s3 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s0, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, s15 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s13 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s14, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[14:15], s[14:15], 48 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s82 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s83 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s9 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s11 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[82:83], s[12:13], 48 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s81 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s9 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[80:81], s[10:11], 48 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s78 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s79 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s82 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s83 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[78:79], s[8:9], 48 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s76 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s77 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s81 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[76:77], s[6:7], 48 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s74 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s75 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s14, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s78 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s79 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[74:75], s[4:5], 48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s52, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s73 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s76 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s77 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[72:73], s[2:3], 48 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s12, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[70:71], s[0:1], 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s74 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s75 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s12, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[54:55], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s73 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[56:57], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[58:59], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s6, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[60:61], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s4, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s4, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s2, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[0:1], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[68:69], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -2144,17 +2144,17 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v1 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v3 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v5 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 @@ -2164,50 +2164,50 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[1:4] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[8:11] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i32: @@ -2675,94 +2675,94 @@ ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x70 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 64 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v10 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[3:6] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(4) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v12 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[16:19] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32: @@ -3102,78 +3102,78 @@ ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v0 ; GCN-HSA-NEXT: v_bfe_i32 v18, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v16, v0, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v18, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v2, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v18, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v2, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: s_waitcnt vmcnt(4) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v6 ; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v18, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v9 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v9, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v8, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GCN-HSA-NEXT: v_bfe_i32 v2, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v12 ; GCN-HSA-NEXT: v_bfe_i32 v6, v13, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v12, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v14 ; GCN-HSA-NEXT: v_bfe_i32 v2, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v14, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32: @@ -3598,14 +3598,17 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s6, s2, 16 +; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 -; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 @@ -3614,15 +3617,12 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s8 -; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s4 ; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 @@ -3690,7 +3690,7 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 @@ -3704,7 +3704,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v17 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v16 ; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v17 @@ -3729,13 +3729,12 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: s_waitcnt vmcnt(12) +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v33 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v32 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v33 ; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v25 @@ -6060,45 +6059,45 @@ ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i64: @@ -6257,22 +6256,22 @@ ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[10:11], v[0:1], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[14:15], v[2:3], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[14:15], v[0:1], 48 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v5, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v7, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -6285,40 +6284,40 @@ ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[0:1], 48 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_ashr_i64 v[14:15], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 -; GCN-HSA-NEXT: v_bfe_i32 v0, v11, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v10, v10, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_bfe_i32 v0, v7, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i64: @@ -6337,15 +6336,15 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v11, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v10, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 @@ -6480,65 +6479,65 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v21 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v8 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -6547,64 +6546,65 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v7 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v7 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: @@ -6621,35 +6621,34 @@ ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, v4 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v6 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v4 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v4 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v4 @@ -6658,6 +6657,7 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 @@ -6857,51 +6857,49 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, v7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[9:10], v[4:5], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v15, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[10:11], v[2:3], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[11:12], v[0:1], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[14:15], v[0:1], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[2:3], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v1, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v11, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[25:26], v[6:7], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[26:27], v[4:5], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v5, 0, 16 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: @@ -6930,66 +6928,66 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 48 ; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[2:3], 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v10, v16, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: v_mov_b32_e32 v16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[4:5], 48 -; GCN-HSA-NEXT: v_bfe_i32 v0, v5, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GCN-HSA-NEXT: v_bfe_i32 v10, v8, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v6, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GCN-HSA-NEXT: v_ashr_i64 v[14:15], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v12, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[6:7], 48 -; GCN-HSA-NEXT: v_bfe_i32 v4, v11, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 -; GCN-HSA-NEXT: v_bfe_i32 v2, v1, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_bfe_i32 v2, v17, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_bfe_i32 v4, v16, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64: @@ -7008,55 +7006,55 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v11, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v13, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -7263,6 +7261,9 @@ ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v2 @@ -7280,14 +7281,13 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v22 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v9 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 @@ -7316,10 +7316,10 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v43, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v45, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v1 @@ -7332,8 +7332,8 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v59, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v47, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload @@ -7351,23 +7351,23 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, v12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, 0 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload @@ -7379,8 +7379,8 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -7388,153 +7388,158 @@ ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[9:12], v[9:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 +; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[13:16], v[13:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[17:20], v[17:18] -; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xd0 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xb0 +; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x90 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x90 +; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s16, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s17 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 +; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 +; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, v1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s14 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s12 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s10 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[5:8] -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[5:8] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s9 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v10 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[17:20] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[9:12] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v19 +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v17 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[23:26] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v11 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[5:8] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v9 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, v1 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[5:8] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: @@ -7549,93 +7554,93 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[30:33], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[34:37], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[29:32], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[33:36], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v56, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v53, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v50, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v41, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v59, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v47, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v38, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v58, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, v56 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v30 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v33 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v29 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v41, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v32 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, 0xffff, v36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v57 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v56 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xffff, v6 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v34 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v35 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v58, 16, v37 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xffff, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, v57 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v57 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v33 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v55, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v34, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v56 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v56 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v56 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i16_to_v32i64: @@ -7981,105 +7986,101 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v15 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[14:15], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[2:3], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[12:13], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v3 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[0:1], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[2:3], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[6:7], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[0:1], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v7 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[4:5], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[6:7], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[10:11], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[4:5], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v11 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[8:9], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[10:11], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[14:15], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[8:9], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[12:13], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v3, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v14, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v24, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v4, 0, 16 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 @@ -8093,7 +8094,7 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -8101,9 +8102,9 @@ ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 @@ -8111,154 +8112,154 @@ ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[8:9], 48 -; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0 +; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[6:7], 48 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x70 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x90 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x50 +; GCN-HSA-NEXT: v_bfe_i32 v18, v5, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x70 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, v11 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50 -; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[10:11], 48 -; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 32 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v18, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v10, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s14 +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[2:3], 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[16:19] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 -; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 48 -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[2:3], 48 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s13 +; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[8:9], 48 +; GCN-HSA-NEXT: v_bfe_i32 v3, v9, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v11 +; GCN-HSA-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[10:11], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashr_i64 v[9:10], v[12:13], 48 -; GCN-HSA-NEXT: v_bfe_i32 v7, v13, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[12:13], 48 +; GCN-HSA-NEXT: v_bfe_i32 v3, v13, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[3:6] +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s10 -; GCN-HSA-NEXT: v_bfe_i32 v7, v3, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[9:10], v[14:15], 48 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[14:15], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[3:6] +; GCN-HSA-NEXT: v_bfe_i32 v19, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v23, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v21, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v25, v25, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v7, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[23:26] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[7:10] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v17, v18, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v17, v5, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_bfe_i32 v15, v6, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v14 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_bfe_i32 v13, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v8, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v3, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GCN-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GCN-HSA-NEXT: v_bfe_i32 v7, v14, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v0, v12, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v11, v14, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v3, v12, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: @@ -8271,95 +8272,95 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[12:13] -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[22:23], 48, v[0:1] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[0:1] -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[22:23], 48, v[4:5] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v19, 0, 16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[4:5] -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[22:23], 48, v[8:9] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v7 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[8:9] -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[22:23], 48, v[12:13] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v18, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[21:22], 48, v[2:3] +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[20:21], 48, v[6:7] +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v17, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[19:20], 48, v[10:11] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v15 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v16, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[14:15] +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[17:18], 48, v[2:3] -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[17:18], 48, v[6:7] -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[17:18], 48, v[10:11] -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:224 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v3, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v14, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v21, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v22, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:160 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:128 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir @@ -1,6 +1,14 @@ # RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler -amdgpu-disable-unclustred-high-rp-reschedule -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s # REQUIRES: asserts +--- | + define void @sink_and_inc_idx_when_skipping_small_region_1() "amdgpu-flat-work-group-size"="1,64" { + ret void + } + + define void @sink_and_inc_idx_when_skipping_small_regions_2() "amdgpu-flat-work-group-size"="1,64" { + ret void + } --- name: sink_and_inc_idx_when_skipping_small_region_1 tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -139,16 +139,16 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]] @@ -248,14 +248,14 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -356,15 +356,15 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_22]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -464,27 +464,27 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.4(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]] - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -600,29 +600,29 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_22]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.4(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]] - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_25]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -722,7 +722,6 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0 @@ -743,6 +742,8 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} @@ -758,8 +759,8 @@ ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) @@ -772,8 +773,7 @@ ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.1(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: S_BRANCH %bb.1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: @@ -1114,6 +1114,14 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 @@ -1186,19 +1194,12 @@ ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 78 ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 79 ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 80 @@ -1215,14 +1216,13 @@ ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc @@ -1643,6 +1643,10 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 @@ -1715,10 +1719,6 @@ ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 @@ -1759,9 +1759,6 @@ ; GFX908-NEXT: S_BRANCH %bb.1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_10]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_11]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_12]] @@ -1774,6 +1771,9 @@ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_19]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]] ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_6]], implicit [[S_MOV_B32_7]] ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_8]], implicit [[S_MOV_B32_9]] ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_10]], implicit [[S_MOV_B32_11]] @@ -2049,6 +2049,10 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 @@ -2121,13 +2125,9 @@ ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 @@ -2801,7 +2801,6 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0 @@ -2823,6 +2822,7 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} @@ -2988,6 +2988,7 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} @@ -3003,10 +3004,9 @@ ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) @@ -4974,13 +4974,13 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode - ; GFX908-NEXT: %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode + ; GFX908-NEXT: %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -5192,13 +5192,13 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] - ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: @@ -5297,6 +5297,7 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -5304,7 +5305,6 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]] - ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: @@ -5726,17 +5726,17 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: DBG_VALUE %23, 0, 0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: DBG_VALUE [[V_CVT_I32_F64_e32_23]], 0, 0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]] @@ -5836,17 +5836,17 @@ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def %22, 327689 /* reguse:SReg_1_with_sub0 */, [[V_CVT_I32_F64_e32_4]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]] diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.mir b/llvm/test/CodeGen/AMDGPU/memory_clause.mir --- a/llvm/test/CodeGen/AMDGPU/memory_clause.mir +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.mir @@ -261,9 +261,11 @@ # GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, implicit $exec # GCN-NEXT: dead %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, implicit $exec # GCN-NEXT: dead %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, implicit $exec +# GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, implicit $exec +# GCN-NEXT: dead %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, implicit $exec # GCN-NEXT: KILL %0{{$}} -# GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 80, 0, implicit $exec -# GCN-NEXT: dead %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 96, 0, implicit $exec +# GCN-NEXT: dead %9:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 128, 0, implicit $exec +# GCN-NEXT: dead %10:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 144, 0, implicit $exec # GCN-NEXT: KILL %1{{$}} --- @@ -278,8 +280,10 @@ %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, implicit $exec %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, implicit $exec %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, implicit $exec - %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 80, 0, implicit $exec - %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 96, 0, implicit $exec + %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, implicit $exec + %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, implicit $exec + %9:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 128, 0, implicit $exec + %10:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 144, 0, implicit $exec ... # GCN-LABEL: {{^}}name: image_clause{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -73,22 +73,22 @@ ; GFX9-NEXT: .LBB1_2: ; %bb23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 +; GFX9-NEXT: v_add_u32_e32 v18, v9, v0 ; GFX9-NEXT: v_add_u32_e32 v12, v17, v0 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5 -; GFX9-NEXT: v_add_u32_e32 v19, v3, v16 -; GFX9-NEXT: v_add_u32_e32 v3, v9, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v18 -; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v13 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v19, v15, v[3:4] +; GFX9-NEXT: v_mul_u32_u24_e32 v19, v3, v5 +; GFX9-NEXT: v_add_u32_e32 v20, v3, v16 +; GFX9-NEXT: v_sub_u32_e32 v3, v18, v19 +; GFX9-NEXT: v_sub_u32_e32 v12, v12, v19 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v20, v15, v[3:4] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v20, v13 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v18, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4] -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18 ; GFX9-NEXT: v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7] ; GFX9-NEXT: global_load_dword v3, v[18:19], off diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll --- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll +++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll @@ -15,7 +15,7 @@ ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 -define amdgpu_kernel void @max_occupancy() { +define amdgpu_kernel void @max_occupancy() #10 { ret void } @@ -52,7 +52,7 @@ ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 -define amdgpu_kernel void @used_24_vgprs() { +define amdgpu_kernel void @used_24_vgprs() #10 { call void asm sideeffect "", "~{v23}" () ret void } @@ -63,7 +63,7 @@ ; GFX1010W32: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 -define amdgpu_kernel void @used_28_vgprs() { +define amdgpu_kernel void @used_28_vgprs() #10 { call void asm sideeffect "", "~{v27}" () ret void } @@ -74,7 +74,7 @@ ; GFX1010W32: ; Occupancy: 20 ; GFX1030W32: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 -define amdgpu_kernel void @used_32_vgprs() { +define amdgpu_kernel void @used_32_vgprs() #10 { call void asm sideeffect "", "~{v31}" () ret void } @@ -86,7 +86,7 @@ ; GFX1030W64: ; Occupancy: 12 ; GFX1030W32: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 -define amdgpu_kernel void @used_36_vgprs() { +define amdgpu_kernel void @used_36_vgprs() #10 { call void asm sideeffect "", "~{v35}" () ret void } @@ -97,7 +97,7 @@ ; GFX1010W32: ; Occupancy: 20 ; GFX1030W32: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 -define amdgpu_kernel void @used_40_vgprs() { +define amdgpu_kernel void @used_40_vgprs() #10 { call void asm sideeffect "", "~{v39}" () ret void } @@ -109,7 +109,7 @@ ; GFX1030W64: ; Occupancy: 10 ; GFX1030W32: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 -define amdgpu_kernel void @used_44_vgprs() { +define amdgpu_kernel void @used_44_vgprs() #10 { call void asm sideeffect "", "~{v43}" () ret void } @@ -120,7 +120,7 @@ ; GFX1010W32: ; Occupancy: 20 ; GFX1030W32: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 -define amdgpu_kernel void @used_48_vgprs() { +define amdgpu_kernel void @used_48_vgprs() #10 { call void asm sideeffect "", "~{v47}" () ret void } @@ -132,7 +132,7 @@ ; GFX1030W32: ; Occupancy: 16 ; GFX1100W64: ; Occupancy: 12 ; GFX1100W32: ; Occupancy: 16 -define amdgpu_kernel void @used_56_vgprs() { +define amdgpu_kernel void @used_56_vgprs() #10 { call void asm sideeffect "", "~{v55}" () ret void } @@ -143,7 +143,7 @@ ; GFX10W32: ; Occupancy: 16 ; GFX1100W64: ; Occupancy: 10 ; GFX1100W32: ; Occupancy: 16 -define amdgpu_kernel void @used_64_vgprs() { +define amdgpu_kernel void @used_64_vgprs() #10 { call void asm sideeffect "", "~{v63}" () ret void } @@ -155,7 +155,7 @@ ; GFX1030W32: ; Occupancy: 12 ; GFX1100W64: ; Occupancy: 10 ; GFX1100W32: ; Occupancy: 16 -define amdgpu_kernel void @used_72_vgprs() { +define amdgpu_kernel void @used_72_vgprs() #10 { call void asm sideeffect "", "~{v71}" () ret void } @@ -166,7 +166,7 @@ ; GFX10W32: ; Occupancy: 12 ; GFX1100W64: ; Occupancy: 9 ; GFX1100W32: ; Occupancy: 16 -define amdgpu_kernel void @used_80_vgprs() { +define amdgpu_kernel void @used_80_vgprs() #10 { call void asm sideeffect "", "~{v79}" () ret void } @@ -179,7 +179,7 @@ ; GFX1030W32: ; Occupancy: 10 ; GFX1100W64: ; Occupancy: 9 ; GFX1100W32: ; Occupancy: 16 -define amdgpu_kernel void @used_84_vgprs() { +define amdgpu_kernel void @used_84_vgprs() #10 { call void asm sideeffect "", "~{v83}" () ret void } @@ -191,7 +191,7 @@ ; GFX1030W32: ; Occupancy: 10 ; GFX1100W64: ; Occupancy: 8 ; GFX1100W32: ; Occupancy: 16 -define amdgpu_kernel void @used_88_vgprs() { +define amdgpu_kernel void @used_88_vgprs() #10 { call void asm sideeffect "", "~{v87}" () ret void } @@ -202,7 +202,7 @@ ; GFX10W32: ; Occupancy: 10 ; GFX1100W64: ; Occupancy: 8 ; GFX1100W32: ; Occupancy: 16 -define amdgpu_kernel void @used_96_vgprs() { +define amdgpu_kernel void @used_96_vgprs() #10 { call void asm sideeffect "", "~{v95}" () ret void } @@ -214,7 +214,7 @@ ; GFX10W32: ; Occupancy: 9 ; GFX1100W64: ; Occupancy: 7 ; GFX1100W32: ; Occupancy: 12 -define amdgpu_kernel void @used_100_vgprs() { +define amdgpu_kernel void @used_100_vgprs() #10 { call void asm sideeffect "", "~{v99}" () ret void } @@ -225,7 +225,7 @@ ; GFX10W32: ; Occupancy: 9 ; GFX1100W64: ; Occupancy: 6 ; GFX1100W32: ; Occupancy: 12 -define amdgpu_kernel void @used_112_vgprs() { +define amdgpu_kernel void @used_112_vgprs() #10 { call void asm sideeffect "", "~{v111}" () ret void } @@ -236,7 +236,7 @@ ; GFX10W32: ; Occupancy: 8 ; GFX1100W64: ; Occupancy: 5 ; GFX1100W32: ; Occupancy: 10 -define amdgpu_kernel void @used_128_vgprs() { +define amdgpu_kernel void @used_128_vgprs() #10 { call void asm sideeffect "", "~{v127}" () ret void } @@ -247,7 +247,7 @@ ; GFX10W32: ; Occupancy: 7 ; GFX1100W64: ; Occupancy: 5 ; GFX1100W32: ; Occupancy: 10 -define amdgpu_kernel void @used_144_vgprs() { +define amdgpu_kernel void @used_144_vgprs() #10 { call void asm sideeffect "", "~{v143}" () ret void } @@ -259,7 +259,7 @@ ; GFX1030W32: ; Occupancy: 5 ; GFX1100W64: ; Occupancy: 4 ; GFX1100W32: ; Occupancy: 9 -define amdgpu_kernel void @used_168_vgprs() { +define amdgpu_kernel void @used_168_vgprs() #10 { call void asm sideeffect "", "~{v167}" () ret void } @@ -271,7 +271,7 @@ ; GFX1030W32: ; Occupancy: 4 ; GFX1100W64: ; Occupancy: 3 ; GFX1100W32: ; Occupancy: 7 -define amdgpu_kernel void @used_200_vgprs() { +define amdgpu_kernel void @used_200_vgprs() #10 { call void asm sideeffect "", "~{v199}" () ret void } @@ -282,7 +282,7 @@ ; GFX10W32: ; Occupancy: 4 ; GFX1100W64: ; Occupancy: 2 ; GFX1100W32: ; Occupancy: 5 -define amdgpu_kernel void @used_256_vgprs() { +define amdgpu_kernel void @used_256_vgprs() #10 { call void asm sideeffect "", "~{v255}" () ret void } @@ -292,7 +292,7 @@ ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 -define amdgpu_kernel void @used_80_sgprs() { +define amdgpu_kernel void @used_80_sgprs() #10 { call void asm sideeffect "", "~{s79}" () ret void } @@ -302,7 +302,7 @@ ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 -define amdgpu_kernel void @used_88_sgprs() { +define amdgpu_kernel void @used_88_sgprs() #10 { call void asm sideeffect "", "~{s87}" () ret void } @@ -312,7 +312,7 @@ ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 -define amdgpu_kernel void @used_100_sgprs() { +define amdgpu_kernel void @used_100_sgprs() #10 { call void asm sideeffect "", "~{s99}" () ret void } @@ -322,15 +322,16 @@ ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 -define amdgpu_kernel void @used_101_sgprs() { +define amdgpu_kernel void @used_101_sgprs() #10 { call void asm sideeffect "", "~{s100}" () ret void } ; GCN-LABEL: {{^}}used_lds_6552: -; GFX9: ; Occupancy: 10 -; GFX1010: ; Occupancy: 20 -; GFX1030: ; Occupancy: 16 +; GFX9: ; Occupancy: 8 +; GFX1010W64: ; Occupancy: 20 +; GFX1030W64: ; Occupancy: 16 +; GFX10W32: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 @lds6552 = internal addrspace(3) global [6552 x i8] undef, align 4 define amdgpu_kernel void @used_lds_6552() { @@ -339,9 +340,10 @@ } ; GCN-LABEL: {{^}}used_lds_6556: -; GFX9: ; Occupancy: 10 -; GFX1010: ; Occupancy: 20 -; GFX1030: ; Occupancy: 16 +; GFX9: ; Occupancy: 8 +; GFX1010W64: ; Occupancy: 20 +; GFX1030W64: ; Occupancy: 16 +; GFX10W32: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 @lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4 define amdgpu_kernel void @used_lds_6556() { @@ -350,9 +352,10 @@ } ; GCN-LABEL: {{^}}used_lds_13112: -; GFX9: ; Occupancy: 10 -; GFX1010: ; Occupancy: 20 -; GFX1030: ; Occupancy: 16 +; GFX9: ; Occupancy: 8 +; GFX1010W64: ; Occupancy: 20 +; GFX1030W64: ; Occupancy: 16 +; GFX10W32: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 @lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4 define amdgpu_kernel void @used_lds_13112() { @@ -361,11 +364,11 @@ } ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_64: -; GFX9: ; Occupancy: 7{{$}} -; GFX10W64: ; Occupancy: 7{{$}} -; GFX10W32: ; Occupancy: 14{{$}} -; GFX1100W64: ; Occupancy: 7{{$}} -; GFX1100W32: ; Occupancy: 14{{$}} +; GFX9: ; Occupancy: 2{{$}} +; GFX10W64: ; Occupancy: 4{{$}} +; GFX10W32: ; Occupancy: 8{{$}} +; GFX1100W64: ; Occupancy: 4{{$}} +; GFX1100W32: ; Occupancy: 8{{$}} @lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4 define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 { store volatile i8 1, ptr addrspace(3) @lds8252 @@ -373,44 +376,46 @@ } ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_96: -; GFX9: ; Occupancy: 10{{$}} -; GFX10W64: ; Occupancy: 14{{$}} -; GFX1010W32: ; Occupancy: 20{{$}} -; GFX1030W32: ; Occupancy: 16{{$}} -; GFX1100W64: ; Occupancy: 14{{$}} -; GFX1100W32: ; Occupancy: 16{{$}} +; GFX9: ; Occupancy: 4{{$}} +; GFX10W64: ; Occupancy: 8{{$}} +; GFX10W32: ; Occupancy: 12{{$}} +; GFX1100W64: ; Occupancy: 8{{$}} +; GFX1100W32: ; Occupancy: 12{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 { store volatile i8 1, ptr addrspace(3) @lds8252 ret void } ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_128: -; GFX9: ; Occupancy: 10{{$}} -; GFX10W64: ; Occupancy: 14{{$}} -; GFX1010W32: ; Occupancy: 20{{$}} -; GFX1030W32: ; Occupancy: 16{{$}} -; GFX1100W64: ; Occupancy: 14{{$}} -; GFX1100W32: ; Occupancy: 16{{$}} +; GFX9: ; Occupancy: 4{{$}} +; GFX10W64: ; Occupancy: 8{{$}} +; GFX10W32: ; Occupancy: 15{{$}} +; GFX1100W64: ; Occupancy: 8{{$}} +; GFX1100W32: ; Occupancy: 15{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 { store volatile i8 1, ptr addrspace(3) @lds8252 ret void } ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_192: -; GFX9: ; Occupancy: 10{{$}} -; GFX1010: ; Occupancy: 20{{$}} -; GFX1030: ; Occupancy: 16{{$}} -; GFX1100: ; Occupancy: 16{{$}} +; GFX9: ; Occupancy: 6{{$}} +; GFX10W64: ; Occupancy: 12{{$}} +; GFX1010W32: ; Occupancy: 20{{$}} +; GFX1030W32: ; Occupancy: 15{{$}} +; GFX1100W64: ; Occupancy: 12{{$}} +; GFX1100W32: ; Occupancy: 15{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 { store volatile i8 1, ptr addrspace(3) @lds8252 ret void } ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_256: -; GFX9: ; Occupancy: 10{{$}} -; GFX1010: ; Occupancy: 20{{$}} -; GFX1030: ; Occupancy: 16{{$}} -; GFX1100: ; Occupancy: 16{{$}} +; GFX9: ; Occupancy: 7{{$}} +; GFX10W64: ; Occupancy: 15{{$}} +; GFX1010W32: ; Occupancy: 20{{$}} +; GFX1030W32: ; Occupancy: 16{{$}} +; GFX1100W64: ; Occupancy: 15{{$}} +; GFX1100W32: ; Occupancy: 16{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 { store volatile i8 1, ptr addrspace(3) @lds8252 ret void @@ -427,8 +432,9 @@ } ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_1024: -; GFX9: ; Occupancy: 10{{$}} -; GFX1010: ; Occupancy: 20{{$}} +; GFX9: ; Occupancy: 8{{$}} +; GFX1010W32: ; Occupancy: 16{{$}} +; GFX1010W64: ; Occupancy: 20{{$}} ; GFX1030: ; Occupancy: 16{{$}} ; GFX1100: ; Occupancy: 16{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 { @@ -437,17 +443,17 @@ } ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32: -; GFX9: ; Occupancy: 7{{$}} -; GFX10: ; Occupancy: 7{{$}} -; GFX1100: ; Occupancy: 7{{$}} +; GFX9: ; Occupancy: 2{{$}} +; GFX10: ; Occupancy: 4{{$}} +; GFX1100: ; Occupancy: 4{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 { store volatile i8 1, ptr addrspace(3) @lds8252 ret void } attributes #0 = { "amdgpu-waves-per-eu"="2,3" "amdgpu-flat-work-group-size"="1,64" } -attributes #1 = { "amdgpu-waves-per-eu"="18,18" } -attributes #2 = { "amdgpu-waves-per-eu"="19,19" } +attributes #1 = { "amdgpu-waves-per-eu"="18,18" "amdgpu-flat-work-group-size"="1,32" } +attributes #2 = { "amdgpu-waves-per-eu"="19,19" "amdgpu-flat-work-group-size"="1,32" } attributes #3 = { "amdgpu-flat-work-group-size"="1,64" } attributes #4 = { "amdgpu-flat-work-group-size"="1,96" } attributes #5 = { "amdgpu-flat-work-group-size"="1,128" } diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -5,7 +5,7 @@ # GCN-LABEL: name: global_sextload_v32i32_to_v32i64 # GCN: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) -# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr20, killed renamable $vgpr24_vgpr25_vgpr26_vgpr27, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr0 +# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr20, killed renamable $vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr0 --- name: global_sextload_v32i32_to_v32i64 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -447,110 +447,101 @@ ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v3, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x5000 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v6, 0 -; GFX8-NEXT: s_movk_i32 s4, 0x7f +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: s_movk_i32 s0, 0x7f ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_mov_b32 s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v3 -; GFX8-NEXT: s_mov_b64 s[2:3], vcc -; GFX8-NEXT: v_addc_u32_e64 v8, vcc, -1, v4, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v5 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v5 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v5 ; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_addc_u32_e64 v10, vcc, -1, v4, s[2:3] ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v3 -; GFX8-NEXT: s_mov_b64 s[2:3], vcc -; GFX8-NEXT: v_addc_u32_e64 v12, vcc, -1, v4, s[0:1] -; GFX8-NEXT: s_addk_i32 s5, 0x2000 -; GFX8-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v7, v5 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[11:12] -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xffffd000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_addc_u32_e64 v14, vcc, -1, v4, s[2:3] -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[13:14] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v9, v15 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v10, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffd800, v3 -; GFX8-NEXT: s_mov_b64 s[2:3], vcc -; GFX8-NEXT: v_addc_u32_e64 v6, vcc, -1, v4, s[0:1] -; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v7, v15 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffe000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_addc_u32_e64 v10, vcc, -1, v4, s[2:3] -; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v11, v13 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, v12, v8, vcc -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffe800, v3 -; GFX8-NEXT: s_mov_b64 s[2:3], vcc -; GFX8-NEXT: v_addc_u32_e64 v8, vcc, -1, v4, s[0:1] -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v5, v13 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v12, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xfffff000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_addc_u32_e64 v12, vcc, -1, v4, s[2:3] +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v5 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v5 ; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v9, v13 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v10, v6, vcc -; GFX8-NEXT: v_addc_u32_e64 v6, s[0:1], -1, v4, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xfffff800, v3 -; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v4, vcc -; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v7, v13 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v8, v14, vcc -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[3:4] -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x10000, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v5 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v6, vcc +; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] +; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v5 +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe800, v5 +; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[19:20] +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v6, vcc +; GFX8-NEXT: flat_load_dwordx2 v[21:22], v[21:22] +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xfffff000, v5 +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, -1, v6, vcc +; GFX8-NEXT: flat_load_dwordx2 v[23:24], v[23:24] +; GFX8-NEXT: v_add_u32_e32 v25, vcc, 0xfffff800, v5 +; GFX8-NEXT: v_addc_u32_e32 v26, vcc, -1, v6, vcc +; GFX8-NEXT: flat_load_dwordx2 v[25:26], v[25:26] +; GFX8-NEXT: flat_load_dwordx2 v[27:28], v[5:6] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x10000, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX8-NEXT: s_addk_i32 s1, 0x2000 +; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff +; GFX8-NEXT: s_waitcnt vmcnt(10) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(9) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v9, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v11, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(7) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v13, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v16, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v17, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v18, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v19, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v20, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v13 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, v12, v14, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v21, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v22, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v12, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v23, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v24, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v9, v5 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v10, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v25, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v26, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v27, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v28, v4, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: s_add_i32 s0, s4, -1 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_add_i32 s1, s0, -1 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX8-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s0, s1 ; GFX8-NEXT: s_branch .LBB1_1 ; GFX8-NEXT: .LBB1_5: ; %while.end ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[5:6] +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: clmem_read: @@ -584,94 +575,91 @@ ; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc ; GFX900-NEXT: s_movk_i32 s0, 0x5000 ; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX900-NEXT: s_movk_i32 s4, 0x7f -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: s_movk_i32 s2, 0xd000 -; GFX900-NEXT: s_movk_i32 s3, 0xe000 -; GFX900-NEXT: s_movk_i32 s5, 0xf000 +; GFX900-NEXT: s_movk_i32 s2, 0x7f +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: s_movk_i32 s0, 0xd000 +; GFX900-NEXT: s_movk_i32 s1, 0xe000 +; GFX900-NEXT: s_movk_i32 s3, 0xf000 ; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX900-NEXT: ; =>This Loop Header: Depth=1 ; GFX900-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: s_mov_b32 s6, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: s_mov_b32 s4, 0 ; GFX900-NEXT: .LBB1_2: ; %for.body ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v3 -; GFX900-NEXT: s_mov_b64 s[0:1], vcc -; GFX900-NEXT: v_addc_co_u32_e64 v8, s[0:1], -1, v4, s[0:1] -; GFX900-NEXT: global_load_dwordx2 v[9:10], v[3:4], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[11:12], v[3:4], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v3 +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v6, vcc +; GFX900-NEXT: global_load_dwordx2 v[9:10], v[5:6], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[11:12], v[5:6], off offset:-2048 +; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v5 ; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v4, vcc -; GFX900-NEXT: s_addk_i32 s6, 0x2000 -; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v7, v5 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v6, vcc -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[13:14], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, v5, v7 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v6, v8, vcc -; GFX900-NEXT: global_load_dwordx2 v[6:7], v[13:14], off -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, s2, v3 -; GFX900-NEXT: s_mov_b64 s[0:1], vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v6, v15 -; GFX900-NEXT: v_addc_co_u32_e64 v6, s[0:1], -1, v4, s[0:1] -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[5:6], off offset:-2048 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v7, v8, vcc -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, s3, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v4, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v5, v13 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v6, v14, vcc -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[7:8], off offset:-4096 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v5, v13 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v6, v14, vcc -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[7:8], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v5, v13 -; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v6, v14, vcc -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, s5, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v4, vcc -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[5:6], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v7, v13 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v14, vcc -; GFX900-NEXT: global_load_dwordx2 v[7:8], v[3:4], off -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, 0x10000, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v6, vcc +; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s0, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v6, vcc +; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048 +; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, s1, v5 +; GFX900-NEXT: global_load_dwordx2 v[13:14], v[13:14], off +; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v6, vcc +; GFX900-NEXT: global_load_dwordx2 v[23:24], v[19:20], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[25:26], v[19:20], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[27:28], v[19:20], off +; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, s3, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v6, vcc +; GFX900-NEXT: global_load_dwordx2 v[19:20], v[21:22], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[29:30], v[5:6], off +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, 0x10000, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX900-NEXT: s_addk_i32 s4, 0x2000 +; GFX900-NEXT: s_cmp_gt_u32 s4, 0x3fffff +; GFX900-NEXT: s_waitcnt vmcnt(8) +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX900-NEXT: s_waitcnt vmcnt(7) +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v17, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v18, v4, vcc +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v13, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v15, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v16, v4, vcc +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v23, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v24, v4, vcc +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v25, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v26, v4, vcc +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v27, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v28, v4, vcc ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v5, v13 -; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v14, vcc -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v9, v5 -; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v6, vcc -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v11, v5 -; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v12, v6, vcc +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v19, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v20, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v9, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v29, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v30, v4, vcc ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX900-NEXT: s_add_i32 s0, s4, -1 -; GFX900-NEXT: s_cmp_eq_u32 s4, 0 +; GFX900-NEXT: s_add_i32 s4, s2, -1 +; GFX900-NEXT: s_cmp_eq_u32 s2, 0 ; GFX900-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX900-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX900-NEXT: s_mov_b32 s4, s0 +; GFX900-NEXT: s_mov_b32 s2, s4 ; GFX900-NEXT: s_branch .LBB1_1 ; GFX900-NEXT: .LBB1_5: ; %while.end ; GFX900-NEXT: v_mov_b32_e32 v1, s35 ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: global_store_dwordx2 v[0:1], v[5:6], off +; GFX900-NEXT: global_store_dwordx2 v[0:1], v[3:4], off ; GFX900-NEXT: s_endpgm ; ; GFX10-LABEL: clmem_read: diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll --- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll +++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll @@ -124,7 +124,7 @@ ; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0 ; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0 ; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0 -; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 10 +; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 8 ; STDERR-NEXT: remark: foo.cl:8:0: SGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:8:0: VGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:8:0: LDS Size [bytes/block]: 0 diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -42,7 +42,7 @@ ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def dead %11 ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1) ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]] ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3) ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def %15, 851978 /* regdef:VGPR_LO16 */, def %16 ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec @@ -50,8 +50,8 @@ ; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def %21, 851978 /* regdef:VGPR_LO16 */, def %22 ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def dead [[V_MOV_B32_e32_2]], 851978 /* regdef:VGPR_LO16 */, def dead [[V_MOV_B32_e32_3]], 851977 /* reguse:VGPR_LO16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_2]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_3]](tied-def 5), 851977 /* reguse:VGPR_LO16 */, %15, 851977 /* reguse:VGPR_LO16 */, %16, 851977 /* reguse:VGPR_LO16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_LO16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_LO16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_LO16 */, [[DS_READ_B32_gfx9_2]] - ; CHECK-NEXT: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]] ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store (s64), addrspace 3) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir @@ -16,18 +16,18 @@ ; CHECK-NEXT: undef %0.sub3:vreg_128 = COPY $vgpr9 ; CHECK-NEXT: undef %1.sub2:vreg_128 = COPY $vgpr8 ; CHECK-NEXT: undef %2.sub1:vreg_128 = COPY $vgpr7 - ; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY $vgpr1 - ; CHECK-NEXT: %8.sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: undef %3.sub0:vreg_128 = COPY $vgpr6 ; CHECK-NEXT: undef %4.sub3:vreg_128 = COPY $vgpr5 ; CHECK-NEXT: undef %5.sub2:vreg_128 = COPY $vgpr4 + ; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY $vgpr1 + ; CHECK-NEXT: %8.sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: undef %6.sub1:vreg_128 = COPY $vgpr3 ; CHECK-NEXT: undef %7.sub0:vreg_128 = COPY $vgpr2 ; CHECK-NEXT: undef %9.sub0:sgpr_128 = V_READFIRSTLANE_B32 %7.sub0, implicit $exec ; CHECK-NEXT: %9.sub1:sgpr_128 = V_READFIRSTLANE_B32 %6.sub1, implicit $exec + ; CHECK-NEXT: S_BARRIER ; CHECK-NEXT: %9.sub2:sgpr_128 = V_READFIRSTLANE_B32 %5.sub2, implicit $exec ; CHECK-NEXT: %9.sub3:sgpr_128 = V_READFIRSTLANE_B32 %4.sub3, implicit $exec - ; CHECK-NEXT: S_BARRIER ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %9, 0, 0, 0, 0, implicit $exec ; CHECK-NEXT: undef %12.sub0:sgpr_128 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec ; CHECK-NEXT: %12.sub1:sgpr_128 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll @@ -3,8 +3,7 @@ ; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully ; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs, -; which currently looks to the scheduler like an occupancy reduction, even -; though it's not. TODO: Fix! +; which (incorrectly) used to look to the scheduler like an occupancy reduction. ; 6 kB of LDS, allows 10 workgroups @lds = internal addrspace(3) global [384 x <4 x i32>] undef @@ -20,7 +19,7 @@ define amdgpu_cs void @test(ptr addrspace(1) %src) "amdgpu-flat-work-group-size"="32,32" { ; CHECK-LABEL: test: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_clause 0xa +; CHECK-NEXT: s_clause 0x1f ; CHECK-NEXT: global_load_b128 v[2:5], v[0:1], off ; CHECK-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16 ; CHECK-NEXT: global_load_b128 v[10:13], v[0:1], off offset:32 @@ -32,81 +31,92 @@ ; CHECK-NEXT: global_load_b128 v[34:37], v[0:1], off offset:128 ; CHECK-NEXT: global_load_b128 v[38:41], v[0:1], off offset:144 ; CHECK-NEXT: global_load_b128 v[42:45], v[0:1], off offset:160 -; CHECK-NEXT: v_mov_b32_e32 v86, 0 -; CHECK-NEXT: s_clause 0x8 ; CHECK-NEXT: global_load_b128 v[46:49], v[0:1], off offset:176 -; CHECK-NEXT: global_load_b128 v[50:53], v[0:1], off offset:240 -; CHECK-NEXT: global_load_b128 v[54:57], v[0:1], off offset:224 -; CHECK-NEXT: global_load_b128 v[58:61], v[0:1], off offset:208 -; CHECK-NEXT: global_load_b128 v[62:65], v[0:1], off offset:192 -; CHECK-NEXT: global_load_b128 v[66:69], v[0:1], off offset:304 -; CHECK-NEXT: global_load_b128 v[70:73], v[0:1], off offset:288 -; CHECK-NEXT: global_load_b128 v[74:77], v[0:1], off offset:272 -; CHECK-NEXT: global_load_b128 v[78:81], v[0:1], off offset:256 +; CHECK-NEXT: global_load_b128 v[50:53], v[0:1], off offset:192 +; CHECK-NEXT: global_load_b128 v[54:57], v[0:1], off offset:208 +; CHECK-NEXT: global_load_b128 v[58:61], v[0:1], off offset:224 +; CHECK-NEXT: global_load_b128 v[62:65], v[0:1], off offset:240 +; CHECK-NEXT: global_load_b128 v[66:69], v[0:1], off offset:256 +; CHECK-NEXT: global_load_b128 v[70:73], v[0:1], off offset:272 +; CHECK-NEXT: global_load_b128 v[74:77], v[0:1], off offset:288 +; CHECK-NEXT: global_load_b128 v[78:81], v[0:1], off offset:304 +; CHECK-NEXT: global_load_b128 v[82:85], v[0:1], off offset:320 +; CHECK-NEXT: global_load_b128 v[86:89], v[0:1], off offset:336 +; CHECK-NEXT: global_load_b128 v[90:93], v[0:1], off offset:352 +; CHECK-NEXT: global_load_b128 v[94:97], v[0:1], off offset:368 +; CHECK-NEXT: global_load_b128 v[98:101], v[0:1], off offset:384 +; CHECK-NEXT: global_load_b128 v[102:105], v[0:1], off offset:400 +; CHECK-NEXT: global_load_b128 v[106:109], v[0:1], off offset:416 +; CHECK-NEXT: global_load_b128 v[110:113], v[0:1], off offset:432 +; CHECK-NEXT: global_load_b128 v[114:117], v[0:1], off offset:448 +; CHECK-NEXT: global_load_b128 v[118:121], v[0:1], off offset:464 +; CHECK-NEXT: global_load_b128 v[122:125], v[0:1], off offset:480 +; CHECK-NEXT: global_load_b128 v[126:129], v[0:1], off offset:496 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt vmcnt(31) +; CHECK-NEXT: ds_store_b128 v0, v[2:5] +; CHECK-NEXT: s_waitcnt vmcnt(30) +; CHECK-NEXT: ds_store_b128 v0, v[6:9] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: ds_store_b128 v0, v[10:13] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(28) +; CHECK-NEXT: ds_store_b128 v0, v[14:17] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(27) +; CHECK-NEXT: ds_store_b128 v0, v[18:21] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: ds_store_b128 v0, v[22:25] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: ds_store_b128 v0, v[26:29] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: ds_store_b128 v0, v[30:33] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(23) +; CHECK-NEXT: ds_store_b128 v0, v[34:37] offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(22) +; CHECK-NEXT: ds_store_b128 v0, v[38:41] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: ds_store_b128 v0, v[42:45] offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: ds_store_b128 v0, v[46:49] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: ds_store_b128 v86, v[2:5] +; CHECK-NEXT: ds_store_b128 v0, v[50:53] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: ds_store_b128 v86, v[6:9] offset:16 +; CHECK-NEXT: ds_store_b128 v0, v[54:57] offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: ds_store_b128 v86, v[10:13] offset:32 +; CHECK-NEXT: ds_store_b128 v0, v[58:61] offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: ds_store_b128 v86, v[14:17] offset:48 +; CHECK-NEXT: ds_store_b128 v0, v[62:65] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: ds_store_b128 v86, v[18:21] offset:64 +; CHECK-NEXT: ds_store_b128 v0, v[66:69] offset:256 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: ds_store_b128 v86, v[22:25] offset:80 +; CHECK-NEXT: ds_store_b128 v0, v[70:73] offset:272 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: ds_store_b128 v86, v[26:29] offset:96 +; CHECK-NEXT: ds_store_b128 v0, v[74:77] offset:288 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: ds_store_b128 v86, v[30:33] offset:112 +; CHECK-NEXT: ds_store_b128 v0, v[78:81] offset:304 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: ds_store_b128 v86, v[34:37] offset:128 +; CHECK-NEXT: ds_store_b128 v0, v[82:85] offset:320 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: ds_store_b128 v86, v[38:41] offset:144 +; CHECK-NEXT: ds_store_b128 v0, v[86:89] offset:336 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: ds_store_b128 v86, v[42:45] offset:160 -; CHECK-NEXT: s_clause 0xb -; CHECK-NEXT: global_load_b128 v[2:5], v[0:1], off offset:368 -; CHECK-NEXT: global_load_b128 v[6:9], v[0:1], off offset:352 -; CHECK-NEXT: global_load_b128 v[10:13], v[0:1], off offset:336 -; CHECK-NEXT: global_load_b128 v[14:17], v[0:1], off offset:320 -; CHECK-NEXT: global_load_b128 v[18:21], v[0:1], off offset:432 -; CHECK-NEXT: global_load_b128 v[22:25], v[0:1], off offset:416 -; CHECK-NEXT: global_load_b128 v[26:29], v[0:1], off offset:400 -; CHECK-NEXT: global_load_b128 v[30:33], v[0:1], off offset:384 -; CHECK-NEXT: global_load_b128 v[34:37], v[0:1], off offset:464 -; CHECK-NEXT: global_load_b128 v[38:41], v[0:1], off offset:448 -; CHECK-NEXT: global_load_b128 v[42:45], v[0:1], off offset:480 -; CHECK-NEXT: global_load_b128 v[82:85], v[0:1], off offset:496 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: ds_store_b128 v86, v[46:49] offset:176 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: ds_store_b128 v86, v[62:65] offset:192 -; CHECK-NEXT: ds_store_b128 v86, v[58:61] offset:208 -; CHECK-NEXT: ds_store_b128 v86, v[54:57] offset:224 -; CHECK-NEXT: ds_store_b128 v86, v[50:53] offset:240 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: ds_store_b128 v86, v[78:81] offset:256 -; CHECK-NEXT: ds_store_b128 v86, v[74:77] offset:272 -; CHECK-NEXT: ds_store_b128 v86, v[70:73] offset:288 -; CHECK-NEXT: ds_store_b128 v86, v[66:69] offset:304 +; CHECK-NEXT: ds_store_b128 v0, v[90:93] offset:352 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: ds_store_b128 v86, v[14:17] offset:320 -; CHECK-NEXT: ds_store_b128 v86, v[10:13] offset:336 -; CHECK-NEXT: ds_store_b128 v86, v[6:9] offset:352 -; CHECK-NEXT: ds_store_b128 v86, v[2:5] offset:368 +; CHECK-NEXT: ds_store_b128 v0, v[94:97] offset:368 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: ds_store_b128 v0, v[98:101] offset:384 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: ds_store_b128 v0, v[102:105] offset:400 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: ds_store_b128 v0, v[106:109] offset:416 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: ds_store_b128 v86, v[30:33] offset:384 -; CHECK-NEXT: ds_store_b128 v86, v[26:29] offset:400 -; CHECK-NEXT: ds_store_b128 v86, v[22:25] offset:416 -; CHECK-NEXT: ds_store_b128 v86, v[18:21] offset:432 +; CHECK-NEXT: ds_store_b128 v0, v[110:113] offset:432 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: ds_store_b128 v0, v[114:117] offset:448 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: ds_store_b128 v86, v[38:41] offset:448 -; CHECK-NEXT: ds_store_b128 v86, v[34:37] offset:464 +; CHECK-NEXT: ds_store_b128 v0, v[118:121] offset:464 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: ds_store_b128 v86, v[42:45] offset:480 +; CHECK-NEXT: ds_store_b128 v0, v[122:125] offset:480 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_store_b128 v86, v[82:85] offset:496 +; CHECK-NEXT: ds_store_b128 v0, v[126:129] offset:496 ; CHECK-NEXT: s_endpgm call void @copy(ptr addrspace(1) %src, i32 0) call void @copy(ptr addrspace(1) %src, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -822,116 +822,116 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4 +; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 +; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 +; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2 ; GCN-NEXT: v_xor_b32_e32 v15, v8, v9 +; GCN-NEXT: v_xor_b32_e32 v16, v10, v11 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 ; GCN-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 -; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; GCN-NEXT: v_cvt_f32_u32_e32 v10, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GCN-NEXT: v_xor_b32_e32 v6, v6, v13 +; GCN-NEXT: v_xor_b32_e32 v17, v12, v13 +; GCN-NEXT: v_xor_b32_e32 v2, v2, v12 +; GCN-NEXT: v_cvt_f32_u32_e32 v12, v6 ; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 +; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; GCN-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 ; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GCN-NEXT: v_sub_i32_e32 v11, vcc, 0, v5 ; GCN-NEXT: v_mul_lo_u32 v9, v9, v8 -; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 -; GCN-NEXT: v_xor_b32_e32 v6, v6, v13 -; GCN-NEXT: v_xor_b32_e32 v16, v10, v11 +; GCN-NEXT: v_mul_lo_u32 v11, v11, v10 +; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v6 +; GCN-NEXT: v_mul_lo_u32 v13, v13, v12 ; GCN-NEXT: v_mul_hi_u32 v9, v8, v9 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 -; GCN-NEXT: v_cvt_f32_u32_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_u32_e32 v11, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v10, v11 +; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GCN-NEXT: v_mul_hi_u32 v13, v12, v13 +; GCN-NEXT: v_xor_b32_e32 v7, v7, v14 +; GCN-NEXT: v_cvt_f32_u32_e32 v18, v7 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11 +; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, v10 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v8 -; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2 -; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v10 -; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v11 +; GCN-NEXT: v_mul_hi_u32 v9, v1, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, v12, v13 +; GCN-NEXT: v_mul_hi_u32 v10, v2, v10 +; GCN-NEXT: v_rcp_iflag_f32_e32 v18, v18 ; GCN-NEXT: v_mul_lo_u32 v11, v8, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; GCN-NEXT: v_xor_b32_e32 v17, v12, v13 -; GCN-NEXT: v_xor_b32_e32 v2, v2, v12 -; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v5 +; GCN-NEXT: v_mul_lo_u32 v13, v9, v5 +; GCN-NEXT: v_mul_lo_u32 v21, v10, v6 +; GCN-NEXT: v_mul_f32_e32 v18, 0x4f7ffffe, v18 +; GCN-NEXT: v_cvt_u32_f32_e32 v18, v18 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 -; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GCN-NEXT: v_mul_lo_u32 v12, v12, v9 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v13 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 1, v9 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v21 ; GCN-NEXT: v_subrev_i32_e32 v11, vcc, v4, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] +; GCN-NEXT: v_subrev_i32_e32 v12, vcc, v5, v1 +; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v20, s[2:3] +; GCN-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 1, v10 +; GCN-NEXT: v_subrev_i32_e32 v13, vcc, v6, v2 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] -; GCN-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v6 -; GCN-NEXT: v_mul_lo_u32 v0, v0, v10 -; GCN-NEXT: v_xor_b32_e32 v4, v7, v14 -; GCN-NEXT: v_mul_hi_u32 v7, v9, v12 -; GCN-NEXT: v_cvt_f32_u32_e32 v12, v4 -; GCN-NEXT: v_mul_hi_u32 v0, v10, v0 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GCN-NEXT: v_mul_hi_u32 v7, v1, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GCN-NEXT: v_mul_hi_u32 v0, v2, v0 -; GCN-NEXT: v_mul_lo_u32 v10, v7, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 -; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 -; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v7 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[2:3] -; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GCN-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[4:5] -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v5, v1 -; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[2:3] -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v6, v2 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] -; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v7 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc -; GCN-NEXT: v_xor_b32_e32 v1, v8, v15 -; GCN-NEXT: v_xor_b32_e32 v5, v0, v16 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v15, v1 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v16, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v9, v12 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v9 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v19, v18 +; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v11, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v18, v4 +; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GCN-NEXT: v_mul_hi_u32 v5, v12, v5 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v10 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GCN-NEXT: v_mul_hi_u32 v5, v3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v18 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 +; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v22, s[4:5] +; GCN-NEXT: v_xor_b32_e32 v0, v0, v15 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v16 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v13, s[4:5] +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v15, v0 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v16, v1 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v10 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v5, vcc +; GCN-NEXT: v_mul_lo_u32 v5, v4, v7 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v17 -; GCN-NEXT: v_mul_lo_u32 v6, v5, v4 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v17, v2 -; GCN-NEXT: v_xor_b32_e32 v7, v8, v14 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v5 -; GCN-NEXT: v_subrev_i32_e32 v8, vcc, v4, v3 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_xor_b32_e32 v6, v8, v14 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 +; GCN-NEXT: v_subrev_i32_e32 v8, vcc, v7, v3 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v5 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v7, v3 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 +; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GCN-NEXT: v_xor_b32_e32 v3, v3, v6 +; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v6, v3 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; @@ -953,116 +953,116 @@ ; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 +; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5 +; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v6 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v8, v0 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1 ; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9 +; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11 +; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 ; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9 +; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 ; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v4 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v4 -; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v5 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v12, v2 +; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13 +; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13 +; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12 +; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v6 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v4 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; TONGA-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 ; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1 +; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 +; TONGA-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12 +; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v5 ; TONGA-NEXT: v_mul_lo_u32 v9, v9, v8 -; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11 -; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13 -; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11 +; TONGA-NEXT: v_mul_lo_u32 v11, v11, v10 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v6 +; TONGA-NEXT: v_mul_lo_u32 v13, v13, v12 ; TONGA-NEXT: v_mul_hi_u32 v9, v8, v9 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 -; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v5 -; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v6 +; TONGA-NEXT: v_mul_hi_u32 v11, v10, v11 +; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7 +; TONGA-NEXT: v_mul_hi_u32 v13, v12, v13 +; TONGA-NEXT: v_xor_b32_e32 v7, v7, v14 +; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v7 ; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v11, v11 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v11, v10 ; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8 -; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 -; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v10 -; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v11 +; TONGA-NEXT: v_mul_hi_u32 v9, v1, v9 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v12, v13 +; TONGA-NEXT: v_mul_hi_u32 v10, v2, v10 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v18, v18 ; TONGA-NEXT: v_mul_lo_u32 v11, v8, v4 -; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v12, v2 -; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13 -; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12 -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v5 +; TONGA-NEXT: v_mul_lo_u32 v13, v9, v5 +; TONGA-NEXT: v_mul_lo_u32 v21, v10, v6 +; TONGA-NEXT: v_mul_f32_e32 v18, 0x4f7ffffe, v18 +; TONGA-NEXT: v_cvt_u32_f32_e32 v18, v18 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v11 -; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 -; TONGA-NEXT: v_mul_lo_u32 v12, v12, v9 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v13 +; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8 +; TONGA-NEXT: v_add_u32_e32 v20, vcc, 1, v9 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v21 ; TONGA-NEXT: v_subrev_u32_e32 v11, vcc, v4, v0 -; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] +; TONGA-NEXT: v_subrev_u32_e32 v12, vcc, v5, v1 +; TONGA-NEXT: v_cndmask_b32_e64 v9, v9, v20, s[2:3] +; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v7 +; TONGA-NEXT: v_add_u32_e32 v22, vcc, 1, v10 +; TONGA-NEXT: v_subrev_u32_e32 v13, vcc, v6, v2 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v6 -; TONGA-NEXT: v_mul_lo_u32 v0, v0, v10 -; TONGA-NEXT: v_xor_b32_e32 v4, v7, v14 -; TONGA-NEXT: v_mul_hi_u32 v7, v9, v12 -; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v4 -; TONGA-NEXT: v_mul_hi_u32 v0, v10, v0 ; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; TONGA-NEXT: v_mul_hi_u32 v7, v1, v7 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v10, v0 -; TONGA-NEXT: v_mul_hi_u32 v0, v2, v0 -; TONGA-NEXT: v_mul_lo_u32 v10, v7, v5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v4 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v10 -; TONGA-NEXT: v_mul_lo_u32 v10, v0, v6 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 -; TONGA-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 -; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v7 -; TONGA-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[2:3] -; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v0 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; TONGA-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[4:5] -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v5, v1 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[2:3] -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v6, v2 -; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] -; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v7 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc -; TONGA-NEXT: v_xor_b32_e32 v1, v8, v15 -; TONGA-NEXT: v_xor_b32_e32 v5, v0, v16 -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v15, v1 -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v16, v5 -; TONGA-NEXT: v_mul_lo_u32 v5, v9, v12 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v9 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; TONGA-NEXT: v_mul_lo_u32 v4, v19, v18 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v8, v11, vcc +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; TONGA-NEXT: v_mul_hi_u32 v4, v18, v4 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v8, v3 -; TONGA-NEXT: v_mul_hi_u32 v5, v12, v5 ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v8 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v10 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v12 -; TONGA-NEXT: v_mul_hi_u32 v5, v3, v5 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v18 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v22, s[4:5] +; TONGA-NEXT: v_xor_b32_e32 v0, v0, v15 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v16 +; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v13, s[4:5] +; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v15, v0 +; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v16, v1 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v10 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e32 v2, v10, v7, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v2, v10, v5, vcc +; TONGA-NEXT: v_mul_lo_u32 v5, v4, v7 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v17 -; TONGA-NEXT: v_mul_lo_u32 v6, v5, v4 ; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v17, v2 -; TONGA-NEXT: v_xor_b32_e32 v7, v8, v14 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5 -; TONGA-NEXT: v_subrev_u32_e32 v8, vcc, v4, v3 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; TONGA-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 +; TONGA-NEXT: v_xor_b32_e32 v6, v8, v14 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4 +; TONGA-NEXT: v_subrev_u32_e32 v8, vcc, v7, v3 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; TONGA-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 -; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v7, v3 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 +; TONGA-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; TONGA-NEXT: v_xor_b32_e32 v3, v3, v6 +; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -366,96 +366,96 @@ ; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v1 -; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v0, v4 -; GCN-IR-NEXT: v_subb_u32_e32 v12, vcc, v1, v4, vcc -; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v2 -; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v5 -; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v0, v5, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[11:12] -; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v2 -; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v3 -; GCN-IR-NEXT: v_min_u32_e32 v0, v0, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v11 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 32, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v12 -; GCN-IR-NEXT: v_min_u32_e32 v13, v7, v8 -; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v0, v13 -; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[7:8] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[7:8] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v0, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v1, v4, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: v_min_u32_e32 v12, v2, v3 +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v10 +; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v11 +; GCN-IR-NEXT: v_min_u32_e32 v13, v2, v3 +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v12, v13 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 -; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v12, 0, s[6:7] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 +; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v8, v10, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v7 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v8, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], 63, v7 +; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[11:12], v7 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[10:11], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, -1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v3, vcc -; GCN-IR-NEXT: v_not_b32_e32 v0, v0 -; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[11:12], v14 -; GCN-IR-NEXT: v_not_b32_e32 v9, 0 -; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v0, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_not_b32_e32 v9, v12 +; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[10:11], v14 +; GCN-IR-NEXT: v_not_b32_e32 v8, 0 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v9, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v8 -; GCN-IR-NEXT: v_or_b32_e32 v0, v14, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v18, v0 -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v19, v15, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, 1, v11 -; GCN-IR-NEXT: v_or_b32_e32 v7, v16, v7 -; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v13 -; GCN-IR-NEXT: v_and_b32_e32 v16, v13, v3 -; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[11:12] -; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v0, v13 -; GCN-IR-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v16, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v17, v10 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v3 +; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v16, v14 +; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v17, v15, vcc +; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v8 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10 +; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v1 +; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v14, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v13, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v13, v9 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v16, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v8 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB1_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[7:8], 1 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v3 -; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v2 +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v1 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0 ; GCN-IR-NEXT: .LBB1_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v6 -; GCN-IR-NEXT: v_xor_b32_e32 v3, v9, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v2, v10, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v6 +; GCN-IR-NEXT: v_xor_b32_e32 v3, v8, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v2, v9, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -288,18 +288,18 @@ ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v8 ; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 +; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 +; GCN-NEXT: v_or_b32_e32 v11, v9, v11 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 +; GCN-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc ; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v9 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v12 @@ -337,18 +337,18 @@ ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 ; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 +; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v16 +; GCN-NEXT: v_or_b32_e32 v11, v9, v11 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 +; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 @@ -386,18 +386,18 @@ ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 ; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 +; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v16 +; GCN-NEXT: v_or_b32_e32 v11, v9, v11 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 +; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -918,20 +918,20 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 -; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshl_b64 v[9:10], v[9:10], v13 -; SI-NEXT: v_lshl_b64 v[7:8], v[7:8], v11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 +; SI-NEXT: v_lshl_b64 v[6:7], v[6:7], v13 +; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], v11 +; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -614,20 +614,20 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 -; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v6 -; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ashr_i64 v[9:10], v[9:10], v13 -; SI-NEXT: v_ashr_i64 v[7:8], v[7:8], v11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 +; SI-NEXT: v_ashr_i64 v[6:7], v[6:7], v13 +; SI-NEXT: v_ashr_i64 v[4:5], v[4:5], v11 +; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v8 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_v4i64: @@ -640,20 +640,20 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 -; VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3] -; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_ashrrev_i64 v[2:3], v10, v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ashrrev_i64 v[9:10], v13, v[9:10] -; VI-NEXT: v_ashrrev_i64 v[7:8], v11, v[7:8] -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 +; VI-NEXT: v_ashrrev_i64 v[6:7], v13, v[6:7] +; VI-NEXT: v_ashrrev_i64 v[4:5], v11, v[4:5] +; VI-NEXT: v_ashrrev_i64 v[0:1], v8, v[0:1] +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -266,20 +266,20 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 -; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v6 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], v13 -; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], v11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], v13 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v11 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v8 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -762,49 +762,49 @@ ; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21 +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v5, v21 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v6, v22 +; GFX6-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v6, v22 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v7, v23 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v7, v23 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 +; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v8, v24 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v8, v24 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 -; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v9, v25 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v9, v25 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 -; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v10, v26 +; GFX6-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v10, v26 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 -; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 +; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc +; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v11, v27 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 @@ -881,49 +881,49 @@ ; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21 +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v5, v21 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v6, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v6, v22 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v7, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v7, v23 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v8, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v8, v24 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 -; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v9, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v9, v25 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 -; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 +; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v10, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v10, v26 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 -; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 +; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v11, v27 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -862,43 +862,43 @@ ; GCN-NEXT: v_mul_lo_u32 v14, v10, v0 ; GCN-NEXT: v_mul_lo_u32 v16, v11, v1 ; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 -; GCN-NEXT: v_mul_lo_u32 v19, v13, v3 +; GCN-NEXT: v_mul_lo_u32 v20, v13, v3 ; GCN-NEXT: v_sub_u32_e32 v4, vcc, v4, v14 ; GCN-NEXT: v_sub_u32_e32 v5, vcc, v5, v16 ; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v18 -; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v19 +; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v20 ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 -; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12 -; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13 +; GCN-NEXT: v_add_u32_e32 v19, vcc, 1, v12 +; GCN-NEXT: v_add_u32_e32 v21, vcc, 1, v13 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 -; GCN-NEXT: v_subrev_u32_e32 v18, vcc, v0, v4 +; GCN-NEXT: v_subrev_u32_e32 v14, vcc, v0, v4 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] ; GCN-NEXT: v_subrev_u32_e32 v15, vcc, v1, v5 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] -; GCN-NEXT: v_subrev_u32_e32 v17, vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] -; GCN-NEXT: v_subrev_u32_e32 v14, vcc, v3, v7 -; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v18, s[0:1] -; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v10 +; GCN-NEXT: v_subrev_u32_e32 v16, vcc, v2, v6 +; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5] +; GCN-NEXT: v_subrev_u32_e32 v17, vcc, v3, v7 +; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v21, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[0:1] +; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10 ; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v15, s[2:3] ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v17, s[4:5] -; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v14, s[6:7] -; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v13 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v16, s[4:5] +; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v12 +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[6:7] +; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v13 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v16, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v14, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v16, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: s_endpgm ;