diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -636,16 +636,15 @@ [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange, - FeatureDoesNotSupportSRAMECC, FeatureDoesNotSupportXNACK] ->; + FeatureDoesNotSupportXNACK, FeatureDoesNotSupportSRAMECC]>; def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", "sea-islands", [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, - FeatureGFX7GFX8GFX9Insts, FeatureDoesNotSupportSRAMECC] ->; + FeatureGFX7GFX8GFX9Insts, FeatureDoesNotSupportXNACK, + FeatureDoesNotSupportSRAMECC]>; def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", "volcanic-islands", @@ -655,10 +654,8 @@ FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, - FeatureIntClamp, FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC, - FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts - ] ->; + FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, + FeatureGFX7GFX8GFX9Insts, FeatureDoesNotSupportSRAMECC]>; def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", "gfx9", @@ -671,9 +668,7 @@ FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, - FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16 - ] ->; + FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16]>; def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", "gfx10", @@ -687,11 +682,8 @@ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, - FeatureVOP3Literal, FeatureDPP8, - FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC, - FeatureGFX10A16 - ] ->; + FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard, + FeaturePkFmacF16Inst, FeatureGFX10A16, FeatureDoesNotSupportSRAMECC]>; class FeatureSet Features_> { list Features = Features_; @@ -701,19 +693,16 @@ FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; def FeatureISAVersion6_0_1 : FeatureSet< [FeatureSouthernIslands, FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; def FeatureISAVersion7_0_0 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; def FeatureISAVersion7_0_1 : FeatureSet< @@ -721,26 +710,22 @@ HalfRate64Ops, FeatureLDSBankCount32, FeatureFastFMAF32, - FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; def FeatureISAVersion7_0_2 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount16, FeatureFastFMAF32, - FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; def FeatureISAVersion7_0_3 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount16, - FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; def FeatureISAVersion7_0_4 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount32, - FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; def FeatureISAVersion8_0_1 : FeatureSet< @@ -748,54 +733,54 @@ FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, - FeatureXNACK, FeatureUnpackedD16VMem, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureXNACK]>; def FeatureISAVersion8_0_2 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureSGPRInitBug, FeatureUnpackedD16VMem, - FeatureDoesNotSupportXNACK, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureXNACK]>; def FeatureISAVersion8_0_3 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureUnpackedD16VMem, - FeatureDoesNotSupportXNACK, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureXNACK]>; def FeatureISAVersion8_1_0 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount16, - FeatureXNACK, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureXNACK]>; def FeatureISAVersion9_0_0 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, FeatureCodeObjectV3, - FeatureDoesNotSupportXNACK, + FeatureXNACK, FeatureDoesNotSupportSRAMECC]>; def FeatureISAVersion9_0_2 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, + FeatureCodeObjectV3, FeatureXNACK, - FeatureDoesNotSupportSRAMECC, - FeatureCodeObjectV3]>; + FeatureDoesNotSupportSRAMECC]>; def FeatureISAVersion9_0_4 : FeatureSet< [FeatureGFX9, FeatureLDSBankCount32, FeatureFmaMixInsts, - FeatureDoesNotSupportXNACK, - FeatureDoesNotSupportSRAMECC, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureXNACK, + FeatureDoesNotSupportSRAMECC]>; def FeatureISAVersion9_0_6 : FeatureSet< [FeatureGFX9, @@ -805,8 +790,9 @@ FeatureDLInsts, FeatureDot1Insts, FeatureDot2Insts, - FeatureDoesNotSupportXNACK, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureXNACK, + FeatureSRAMECC]>; def FeatureISAVersion9_0_8 : FeatureSet< [FeatureGFX9, @@ -825,14 +811,16 @@ FeatureAtomicFaddInsts, FeatureSRAMECC, FeatureMFMAInlineLiteralBug, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureXNACK, + FeatureSRAMECC]>; def FeatureISAVersion9_0_9 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureXNACK, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureXNACK]>; // TODO: Organize more features into groups. def FeatureGroup { @@ -861,8 +849,8 @@ FeatureScalarAtomics, FeatureScalarFlatScratchInsts, FeatureLdsMisalignedBug, - FeatureDoesNotSupportXNACK, - FeatureCodeObjectV3])>; + FeatureCodeObjectV3, + FeatureXNACK])>; def FeatureISAVersion10_1_1 : FeatureSet< !listconcat(FeatureGroup.GFX10_1_Bugs, @@ -878,8 +866,8 @@ FeatureScalarStores, FeatureScalarAtomics, FeatureScalarFlatScratchInsts, - FeatureDoesNotSupportXNACK, - FeatureCodeObjectV3])>; + FeatureCodeObjectV3, + FeatureXNACK])>; def FeatureISAVersion10_1_2 : FeatureSet< !listconcat(FeatureGroup.GFX10_1_Bugs, @@ -896,8 +884,8 @@ FeatureScalarAtomics, FeatureScalarFlatScratchInsts, FeatureLdsMisalignedBug, - FeatureDoesNotSupportXNACK, - FeatureCodeObjectV3])>; + FeatureCodeObjectV3, + FeatureXNACK])>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -150,17 +150,22 @@ HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - // Disable XNACK on targets where it is not enabled by default unless it is - // explicitly requested. - if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) { + if (DoesNotSupportXNACK && EnableXNACK) { + if (FS.contains("+xnack")) + report_fatal_error("XNACK was requested for a processor that does not support it!"); + ToggleFeature(AMDGPU::FeatureXNACK); EnableXNACK = false; } + // ECC is on by default, but turn it off if the hardware doesn't support it // anyway. This matters for the gfx9 targets with d16 loads, but don't support // ECC. if (DoesNotSupportSRAMECC && EnableSRAMECC) { + if (FS.contains("+sram-ecc")) + report_fatal_error("SRAM ECC was requested for a processor that does not support it!"); + ToggleFeature(AMDGPU::FeatureSRAMECC); EnableSRAMECC = false; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -12,6 +12,8 @@ ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: ; %if.true +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_dword v0, v[0:1], off ; CHECK-NEXT: BB0_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] @@ -39,6 +41,8 @@ ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_cbranch_execnz BB1_2 ; CHECK-NEXT: ; %bb.1: ; %if.true +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_dword v0, v[0:1], off ; CHECK-NEXT: BB1_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] @@ -68,6 +72,8 @@ ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_cbranch_execz BB2_2 ; CHECK-NEXT: ; %bb.1: ; %if.true +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_dword v0, v[0:1], off ; CHECK-NEXT: BB2_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] @@ -99,6 +105,8 @@ ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_cbranch_execz BB3_2 ; CHECK-NEXT: ; %bb.1: ; %if.true +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_dword v0, v[0:1], off ; CHECK-NEXT: BB3_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] @@ -131,6 +139,7 @@ ; CHECK-NEXT: s_add_u32 s4, s4, external_constant@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, external_constant@gotpcrel32@hi+4 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -150,8 +159,10 @@ ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 -; CHECK-NEXT: flat_load_dword v0, v[0:1] ; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_load_dword v0, v[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0 ; CHECK-NEXT: s_xor_b64 s[8:9], vcc, s[6:7] @@ -165,6 +176,8 @@ ; CHECK-NEXT: s_cbranch_execz BB4_5 ; CHECK-NEXT: ; %bb.4: ; %bb11 ; CHECK-NEXT: v_mov_b32_e32 v0, 4.0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: buffer_store_dword v0, v0, s[0:3], s33 offen ; CHECK-NEXT: BB4_5: ; %Flow ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] @@ -215,8 +228,9 @@ ; CHECK-NEXT: s_cbranch_vccnz BB5_3 ; CHECK-NEXT: ; %bb.2: ; %bb4 ; CHECK-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; CHECK-NEXT: global_load_dword v2, v[0:1], off ; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 1 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: global_load_dword v2, v[0:1], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, v0, v2 ; CHECK-NEXT: s_xor_b64 s[2:3], vcc, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -316,6 +316,8 @@ ; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[4:5] ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GPRIDX-NEXT: s_endpgm ; @@ -333,6 +335,8 @@ ; MOVREL-NEXT: s_movrels_b64 s[0:1], s[4:5] ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; MOVREL-NEXT: s_endpgm entry: @@ -374,6 +378,8 @@ ; GPRIDX-NEXT: s_cbranch_execnz BB8_1 ; GPRIDX-NEXT: ; %bb.2: ; GPRIDX-NEXT: s_mov_b64 exec, s[16:17] +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx2 v[0:1], v[1:2], off ; GPRIDX-NEXT: s_endpgm ; @@ -409,6 +415,8 @@ ; MOVREL-NEXT: s_cbranch_execnz BB8_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b64 exec, s[16:17] +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: flat_store_dwordx2 v[0:1], v[1:2] ; MOVREL-NEXT: s_endpgm entry: @@ -474,6 +482,7 @@ ; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) ; GPRIDX-NEXT: v_mov_b32_e32 v17, v1 ; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx2 v[0:1], v[16:17], off ; GPRIDX-NEXT: s_endpgm ; @@ -482,6 +491,8 @@ ; MOVREL-NEXT: s_lshl_b32 m0, s2, 1 ; MOVREL-NEXT: v_movrels_b32_e32 v16, v0 ; MOVREL-NEXT: v_movrels_b32_e32 v17, v1 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: flat_store_dwordx2 v[0:1], v[16:17] ; MOVREL-NEXT: s_endpgm entry: @@ -513,6 +524,8 @@ ; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1] ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GPRIDX-NEXT: s_endpgm ; @@ -538,6 +551,8 @@ ; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1] ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; MOVREL-NEXT: s_endpgm entry: @@ -1223,6 +1238,8 @@ ; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1] ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GPRIDX-NEXT: s_endpgm ; @@ -1248,6 +1265,8 @@ ; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1] ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; MOVREL-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -714,6 +714,8 @@ ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GPRIDX-NEXT: s_mov_b32 s8, 0 ; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GPRIDX-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GPRIDX-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill @@ -776,6 +778,8 @@ ; GPRIDX-NEXT: s_cbranch_execnz BB13_1 ; GPRIDX-NEXT: ; %bb.2: ; GPRIDX-NEXT: s_mov_b64 exec, s[4:5] +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off @@ -803,6 +807,8 @@ ; MOVREL-NEXT: s_mov_b32 s17, 0x401c0000 ; MOVREL-NEXT: s_mov_b32 s16, s8 ; MOVREL-NEXT: s_mov_b32 s18, s8 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; MOVREL-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; MOVREL-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill @@ -851,6 +857,8 @@ ; MOVREL-NEXT: s_cbranch_execnz BB13_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s4 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off @@ -941,6 +949,8 @@ ; GPRIDX-NEXT: s_cbranch_execnz BB14_1 ; GPRIDX-NEXT: ; %bb.2: ; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[1:4], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[5:8], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[9:12], off @@ -1010,6 +1020,8 @@ ; MOVREL-NEXT: s_cbranch_execnz BB14_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[1:4], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[5:8], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[9:12], off @@ -1114,6 +1126,8 @@ ; MOVREL-NEXT: v_movreld_b32_e32 v2, v0 ; MOVREL-NEXT: v_movreld_b32_e32 v3, v1 ; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off @@ -1154,6 +1168,8 @@ ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_movreld_b32_e32 v0, s2 ; MOVREL-NEXT: v_movreld_b32_e32 v1, s3 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off @@ -1239,6 +1255,8 @@ ; GPRIDX-NEXT: s_cbranch_execnz BB17_1 ; GPRIDX-NEXT: ; %bb.2: ; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off @@ -1308,6 +1326,8 @@ ; MOVREL-NEXT: s_cbranch_execnz BB17_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off @@ -1361,6 +1381,8 @@ ; GPRIDX-NEXT: s_cbranch_execnz BB18_1 ; GPRIDX-NEXT: ; %bb.2: ; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[17:20], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[21:24], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[25:28], off @@ -1398,6 +1420,8 @@ ; MOVREL-NEXT: s_cbranch_execnz BB18_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[17:20], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[21:24], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[25:28], off @@ -1438,6 +1462,8 @@ ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_movreld_b32_e32 v0, v16 ; MOVREL-NEXT: v_movreld_b32_e32 v1, v17 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off @@ -1491,6 +1517,8 @@ ; GPRIDX-NEXT: s_cbranch_execnz BB20_1 ; GPRIDX-NEXT: ; %bb.2: ; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[19:22], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[23:26], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[27:30], off @@ -1528,6 +1556,8 @@ ; MOVREL-NEXT: s_cbranch_execnz BB20_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off @@ -2010,24 +2040,32 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s3 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s7 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s11 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s12 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s15 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GPRIDX-NEXT: s_endpgm ; @@ -2068,6 +2106,8 @@ ; MOVREL-NEXT: v_mov_b32_e32 v14, s14 ; MOVREL-NEXT: v_mov_b32_e32 v15, s15 ; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off @@ -2123,6 +2163,8 @@ ; GPRIDX-NEXT: s_cbranch_execnz BB32_1 ; GPRIDX-NEXT: ; %bb.2: ; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[19:22], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[23:26], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[27:30], off @@ -2161,6 +2203,8 @@ ; MOVREL-NEXT: s_cbranch_execnz BB32_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: s_nop 0 +; MOVREL-NEXT: s_nop 0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s ; FIXME: Merge with other test. DS offset folding doesn't work due to ; register bank copies, and no return optimization is missing. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-xnack -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; FIXME: Merge with other test. DS offset folding doesn't work due to ; register bank copies, and no return optimization is missing. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.gws.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.gws.barrier.ll @@ -1,9 +1,9 @@ ; XUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.barrier.ll | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %S/../llvm.amdgcn.ds.gws.barrier.ll ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.barrier.ll | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %S/../llvm.amdgcn.ds.gws.barrier.ll -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.barrier.ll | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %S/../llvm.amdgcn.ds.gws.barrier.ll -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.barrier.ll | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %S/../llvm.amdgcn.ds.gws.barrier.ll -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.barrier.ll | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX10 %S/../llvm.amdgcn.ds.gws.barrier.ll +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -mattr=-xnack -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.barrier.ll | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %S/../llvm.amdgcn.ds.gws.barrier.ll +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -mattr=-xnack -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.barrier.ll | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %S/../llvm.amdgcn.ds.gws.barrier.ll +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=-xnack -asm-verbose=0 -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.barrier.ll | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX10 %S/../llvm.amdgcn.ds.gws.barrier.ll ; Make sure the op is emitted bundled with a waitcnt with and without the retry loop, and the bundle is not removed by ExpandPostRAPseudos. ; XUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.barrier.ll | FileCheck -enable-var-scope -check-prefix=MIR %S/../llvm.amdgcn.ds.gws.barrier.ll -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.barrier.ll | FileCheck -enable-var-scope -check-prefix=MIR %S/../llvm.amdgcn.ds.gws.barrier.ll +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -mattr=-xnack -stop-after=postrapseudos -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.barrier.ll | FileCheck -enable-var-scope -check-prefix=MIR %S/../llvm.amdgcn.ds.gws.barrier.ll diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -15,11 +15,15 @@ ; GCN-NEXT: s_cbranch_scc1 BB0_2 ; GCN-NEXT: ; %bb.1: ; %mid ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: BB0_2: ; %bb ; GCN-NEXT: v_nop ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll @@ -14,10 +14,14 @@ ; GCN-NEXT: s_cbranch_scc1 BB0_2 ; GCN-NEXT: ; %bb.1: ; %mid ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: BB0_2: ; %bb ; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll @@ -14,6 +14,8 @@ ; GCN-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s ; TODO: Merge with DAG test diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s ; TODO: Merge with DAG test diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -8,26 +8,30 @@ ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x2c ; encoding: [0x00,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; encoding: [0x00,0x01,0x00,0xf4,0x2c,0x00,0x00,0xfa] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; encoding: [0x00,0x02,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; encoding: [0x04,0x02,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x04,0x7e,0x02,0x01,0x08,0x11] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x00,0x02,0x7d,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0 @@ -45,6 +49,8 @@ ; GFX8-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -59,6 +65,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; encoding: [0x00,0x02,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11] ; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x7d,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] %tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 false) #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -6,28 +6,32 @@ ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) @@ -45,13 +49,17 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_endpgm ; @@ -68,10 +76,14 @@ ; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, v2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dwordx2 v[6:7], v[4:5], off ; GFX10-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -14,30 +14,54 @@ ; GFX9-NEXT: s_cbranch_scc0 BB0_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5be6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: BB0_2: ; %bb1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5be6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_endpgm entry: @@ -99,14 +123,18 @@ ; GFX9-NEXT: s_addc_u32 s3, s3, gv3@gotpcrel32@hi+4 ; GFX9-NEXT: BB1_3: ; %bb2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm entry: @@ -152,10 +180,14 @@ ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, static.gv3@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv3@rel32@hi+4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: BB2_2: ; %Flow ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] @@ -171,10 +203,14 @@ ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, static.gv1@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv1@rel32@hi+4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: BB2_4: ; %bb2 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/add.i16.ll b/llvm/test/CodeGen/AMDGPU/add.i16.ll --- a/llvm/test/CodeGen/AMDGPU/add.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.i16.ll @@ -5,7 +5,7 @@ ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; VI-NEXT: buffer_store_short [[ADD]] +; VI: buffer_store_short [[ADD]] define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid @@ -22,7 +22,7 @@ ; GCN-LABEL: {{^}}v_test_add_i16_constant: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]] -; VI-NEXT: buffer_store_short [[ADD]] +; VI: buffer_store_short [[ADD]] define amdgpu_kernel void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid @@ -37,7 +37,7 @@ ; GCN-LABEL: {{^}}v_test_add_i16_neg_constant: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]] -; VI-NEXT: buffer_store_short [[ADD]] +; VI: buffer_store_short [[ADD]] define amdgpu_kernel void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid @@ -52,7 +52,7 @@ ; GCN-LABEL: {{^}}v_test_add_i16_inline_neg1: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]] -; VI-NEXT: buffer_store_short [[ADD]] +; VI: buffer_store_short [[ADD]] define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid @@ -68,7 +68,7 @@ ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; VI-NEXT: buffer_store_dword [[ADD]] +; VI: buffer_store_dword [[ADD]] define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid @@ -107,7 +107,7 @@ ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 -; VI-NEXT: buffer_store_dword [[SEXT]] +; VI: buffer_store_dword [[SEXT]] define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid @@ -128,7 +128,7 @@ ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; VI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -260,8 +260,8 @@ ; GFX9: v_pk_add_u16 ; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; VI: v_add_u16_sdwa ; VI: v_add_u16_e32 +; VI: v_add_u16_sdwa ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -425,6 +425,8 @@ ; GCN-NEXT: s_and_b32 s0, 1, s0 ; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_store_short v[0:1], v0 ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll @@ -15,6 +15,8 @@ ; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dword v[0:1], v2, off ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -2,9 +2,9 @@ ; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s ; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC ; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s -; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access,-xnack < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access,-xnack < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access,-xnack < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -enable-var-scope -check-prefix=HSAOPT -check-prefix=OPT %s ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-ENABLE -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+trap-handler,-xnack < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-ENABLE +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler,-xnack < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs ; TRAP-HANDLER-ENABLE: NumSgprs: 60 diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -62,6 +62,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v1, v2, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -91,6 +92,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v2, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -121,6 +123,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB0_2: @@ -153,6 +156,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v2, v1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB0_2: @@ -211,135 +215,148 @@ ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz BB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[6:7] +; GFX8-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s1, s0, s1 +; GFX8-NEXT: s_mul_i32 s3, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[6:7] +; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 +; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s1, s0, s1 -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: s_mul_i32 s3, s2, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB1_2: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1064-NEXT: s_nop 0 +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: v_cmp_ne_u32_e64 s1, 1, 0 ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB1_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1032-NEXT: s_nop 0 +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -413,6 +430,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB2_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -423,6 +441,7 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -465,6 +484,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB2_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -475,6 +495,7 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -523,6 +544,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB2_2: @@ -575,6 +597,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB2_2: @@ -652,6 +675,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB3_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -662,6 +686,7 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -704,6 +729,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB3_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -714,6 +740,7 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -762,6 +789,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB3_2: @@ -814,6 +842,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB3_2: @@ -891,6 +920,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -901,6 +931,7 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -943,6 +974,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -953,6 +985,7 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1001,6 +1034,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB4_2: @@ -1053,6 +1087,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB4_2: @@ -1130,6 +1165,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1163,6 +1199,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1197,6 +1234,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB5_2: @@ -1231,6 +1269,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB5_2: @@ -1322,6 +1361,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB6_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1339,6 +1379,8 @@ ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -1365,6 +1407,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1382,6 +1425,8 @@ ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -1409,6 +1454,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB6_2: @@ -1425,6 +1471,8 @@ ; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 ; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s4, v0 ; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s5, v1, vcc +; GFX1064-NEXT: s_nop 0 +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -1452,6 +1500,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB6_2: @@ -1468,6 +1517,8 @@ ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 ; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s4, v0 ; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo +; GFX1032-NEXT: s_nop 0 +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -1506,9 +1557,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -1520,9 +1574,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1537,8 +1594,10 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -1554,8 +1613,10 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -1617,6 +1678,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v2, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB8_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1627,6 +1689,7 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -1647,6 +1710,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v2, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1657,6 +1721,7 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1678,6 +1743,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB8_2: @@ -1690,6 +1756,7 @@ ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -1711,6 +1778,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v2, v1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB8_2: @@ -1723,6 +1791,7 @@ ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -1770,135 +1839,148 @@ ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz BB9_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[6:7] +; GFX8-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s1, s0, s1 +; GFX8-NEXT: s_mul_i32 s3, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB9_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[6:7] +; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB9_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 +; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s1, s0, s1 -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: s_mul_i32 s3, s2, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB9_2: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 ; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1064-NEXT: s_nop 0 +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: v_cmp_ne_u32_e64 s1, 1, 0 ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB9_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 ; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1032-NEXT: s_nop 0 +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -1972,6 +2054,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB10_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1982,6 +2065,7 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -2024,6 +2108,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB10_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -2034,6 +2119,7 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2082,6 +2168,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB10_2: @@ -2134,6 +2221,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB10_2: @@ -2211,6 +2299,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -2223,7 +2312,9 @@ ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -2245,6 +2336,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -2257,7 +2349,9 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2280,6 +2374,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB11_2: @@ -2293,7 +2388,9 @@ ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -2316,6 +2413,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB11_2: @@ -2329,7 +2427,9 @@ ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -2409,6 +2509,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -2426,6 +2527,8 @@ ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -2452,6 +2555,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -2469,6 +2573,8 @@ ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -2496,6 +2602,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB12_2: @@ -2512,6 +2619,8 @@ ; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 ; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s4, v0 ; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc +; GFX1064-NEXT: s_nop 0 +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -2539,6 +2648,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB12_2: @@ -2555,6 +2665,8 @@ ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 ; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s4, v0 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo +; GFX1032-NEXT: s_nop 0 +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -2593,9 +2705,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -2607,9 +2722,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2624,8 +2742,10 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -2641,8 +2761,10 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -2713,6 +2835,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB14_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -2723,6 +2846,7 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -2765,6 +2889,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB14_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -2775,6 +2900,7 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2823,6 +2949,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB14_2: @@ -2875,6 +3002,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB14_2: @@ -2955,6 +3083,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB15_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -2965,6 +3094,7 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3007,6 +3137,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB15_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -3017,6 +3148,7 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3065,6 +3197,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB15_2: @@ -3117,6 +3250,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB15_2: @@ -3197,6 +3331,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB16_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -3207,6 +3342,7 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3249,6 +3385,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB16_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -3259,6 +3396,7 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3307,6 +3445,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB16_2: @@ -3359,6 +3498,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB16_2: @@ -3439,6 +3579,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB17_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -3449,6 +3590,7 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3491,6 +3633,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB17_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -3501,6 +3644,7 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3549,6 +3693,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB17_2: @@ -3601,6 +3746,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB17_2: @@ -3679,6 +3825,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB18_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -3694,7 +3841,9 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3715,6 +3864,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB18_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -3730,7 +3880,9 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3752,6 +3904,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB18_2: @@ -3766,7 +3919,9 @@ ; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -3788,6 +3943,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB18_2: @@ -3802,7 +3958,9 @@ ; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[0:1] ; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -3871,6 +4029,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB19_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -3881,6 +4040,7 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3923,6 +4083,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB19_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -3933,6 +4094,7 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3981,6 +4143,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB19_2: @@ -4033,6 +4196,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB19_2: @@ -4111,6 +4275,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB20_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -4126,7 +4291,9 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4147,6 +4314,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB20_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -4162,7 +4330,9 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4184,6 +4354,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB20_2: @@ -4198,7 +4369,9 @@ ; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -4220,6 +4393,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB20_2: @@ -4234,7 +4408,9 @@ ; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1] ; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -4303,6 +4479,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB21_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -4313,6 +4490,7 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4355,6 +4533,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB21_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -4365,6 +4544,7 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4413,6 +4593,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB21_2: @@ -4465,6 +4646,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB21_2: @@ -4542,6 +4724,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -4556,7 +4739,9 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4577,6 +4762,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -4591,7 +4777,9 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4613,6 +4801,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB22_2: @@ -4627,7 +4816,9 @@ ; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -4649,6 +4840,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB22_2: @@ -4663,7 +4855,9 @@ ; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc_lo +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -4732,6 +4926,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB23_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -4742,6 +4937,7 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4784,6 +4980,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB23_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -4794,6 +4991,7 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4842,6 +5040,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB23_2: @@ -4894,6 +5093,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB23_2: @@ -4971,6 +5171,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB24_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -4985,7 +5186,9 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -5006,6 +5209,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB24_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -5020,7 +5224,9 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -5042,6 +5248,7 @@ ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB24_2: @@ -5056,7 +5263,9 @@ ; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -5078,6 +5287,7 @@ ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB24_2: @@ -5092,7 +5302,9 @@ ; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -16,6 +16,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -41,9 +42,12 @@ ; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -69,9 +73,12 @@ ; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+code-object-v3 < %s | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -amdgpu-verify-hsa-metadata -filetype=obj -mattr=+code-object-v3 -o /dev/null < %s 2>&1 | FileCheck --check-prefix=PARSER %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+code-object-v3,-xnack < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -amdgpu-verify-hsa-metadata -filetype=obj -mattr=+code-object-v3,-xnack -o /dev/null < %s 2>&1 | FileCheck --check-prefix=PARSER %s ; CHECK-LABEL: {{^}}min_64_max_64: ; CHECK: SGPRBlocks: 0 diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck --check-prefix=HSAMD %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-xnack < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3,-xnack -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck --check-prefix=HSAMD %s ; CHECK-LABEL: {{^}}min_64_max_64: ; CHECK: SGPRBlocks: 0 diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s ; FIXME: Vectorization can increase required SGPR count beyond limit. diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-xnack -verify-machineinstrs < %s | FileCheck %s ; Exactly 1 wave per execution unit. ; CHECK-LABEL: {{^}}empty_exactly_1: diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll --- a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; The bitcast should be pushed through the bitcasts so the vectors can ; be broken down and the shared components can be CSEd diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -32,13 +32,15 @@ ; FLAT-LABEL: s_brev_i16: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dword s0, s[0:1], 0x2c +; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b32 s0, s0 +; FLAT-NEXT: s_brev_b32 s0, s2 ; FLAT-NEXT: s_lshr_b32 s0, s0, 16 ; FLAT-NEXT: v_mov_b32_e32 v0, s0 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0 ; FLAT-NEXT: s_endpgm %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 @@ -66,16 +68,20 @@ ; FLAT-LABEL: v_brev_i16: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 -; FLAT-NEXT: s_mov_b32 s2, s6 -; FLAT-NEXT: s_mov_b32 s3, s7 +; FLAT-NEXT: s_mov_b32 s10, s6 +; FLAT-NEXT: s_mov_b32 s11, s7 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; FLAT-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0 ; FLAT-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr @@ -100,12 +106,14 @@ ; FLAT-LABEL: s_brev_i32: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dword s0, s[0:1], 0x2c +; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b32 s0, s0 +; FLAT-NEXT: s_brev_b32 s0, s2 ; FLAT-NEXT: v_mov_b32_e32 v0, s0 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_endpgm %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 @@ -134,17 +142,21 @@ ; FLAT-LABEL: v_brev_i32: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: flat_load_dword v0, v[0:1] ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -173,14 +185,16 @@ ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b32 s1, s1 -; FLAT-NEXT: s_brev_b32 s0, s0 -; FLAT-NEXT: v_mov_b32_e32 v0, s0 -; FLAT-NEXT: v_mov_b32_e32 v1, s1 +; FLAT-NEXT: s_brev_b32 s0, s3 +; FLAT-NEXT: s_brev_b32 s1, s2 +; FLAT-NEXT: v_mov_b32_e32 v0, s1 +; FLAT-NEXT: v_mov_b32_e32 v1, s0 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 @@ -210,18 +224,22 @@ ; FLAT-LABEL: v_brev_v2i32: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -320,6 +338,8 @@ ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: v_or_b32_e32 v0, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v1, v3, v1 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 @@ -384,26 +404,28 @@ ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: s_mov_b32 s2, 0x10203 -; FLAT-NEXT: s_mov_b32 s3, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s0, 0x10203 +; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f ; FLAT-NEXT: s_mov_b32 s6, 0xf0f0f0f0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0x33333333 -; FLAT-NEXT: s_mov_b32 s1, 0xcccccccc +; FLAT-NEXT: s_mov_b32 s2, 0x33333333 +; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc ; FLAT-NEXT: s_mov_b32 s8, 0x55555555 ; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa ; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v2, 0, v0, s2 -; FLAT-NEXT: v_perm_b32 v4, 0, v1, s2 -; FLAT-NEXT: v_and_b32_e32 v1, s3, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s3, v4 +; FLAT-NEXT: v_perm_b32 v2, 0, v0, s0 +; FLAT-NEXT: v_perm_b32 v4, 0, v1, s0 +; FLAT-NEXT: v_and_b32_e32 v1, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v0, s1, v4 ; FLAT-NEXT: v_and_b32_e32 v3, s6, v2 ; FLAT-NEXT: v_and_b32_e32 v2, s6, v4 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] @@ -411,10 +433,10 @@ ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s0, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 @@ -427,6 +449,8 @@ ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; FLAT-NEXT: v_or_b32_e32 v1, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v0, v2, v0 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -518,51 +542,51 @@ ; FLAT-LABEL: s_brev_v2i64: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; FLAT-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; FLAT-NEXT: v_mov_b32_e32 v4, 0x10203 -; FLAT-NEXT: s_mov_b32 s8, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s9, 0xcccccccc -; FLAT-NEXT: s_mov_b32 s10, 0x55555555 +; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s1, 0x33333333 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v3, 0, s2, v4 -; FLAT-NEXT: v_perm_b32 v2, 0, s3, v4 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; FLAT-NEXT: v_and_b32_e32 v0, s8, v2 -; FLAT-NEXT: v_and_b32_e32 v1, s8, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s2, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s2, v3 +; FLAT-NEXT: v_perm_b32 v3, 0, s10, v4 +; FLAT-NEXT: v_perm_b32 v2, 0, s11, v4 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s0, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s0, v3 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] -; FLAT-NEXT: v_perm_b32 v7, 0, s0, v4 -; FLAT-NEXT: v_perm_b32 v6, 0, s1, v4 +; FLAT-NEXT: v_perm_b32 v7, 0, s8, v4 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: s_mov_b32 s3, 0x33333333 ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 -; FLAT-NEXT: v_and_b32_e32 v0, s3, v2 -; FLAT-NEXT: v_and_b32_e32 v1, s3, v3 -; FLAT-NEXT: v_and_b32_e32 v4, s8, v6 -; FLAT-NEXT: v_and_b32_e32 v5, s8, v7 -; FLAT-NEXT: v_and_b32_e32 v2, s9, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s9, v3 -; FLAT-NEXT: v_and_b32_e32 v6, s2, v6 -; FLAT-NEXT: v_and_b32_e32 v7, s2, v7 +; FLAT-NEXT: v_perm_b32 v6, 0, s9, v4 +; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc +; FLAT-NEXT: v_and_b32_e32 v0, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s1, v3 +; FLAT-NEXT: v_and_b32_e32 v4, s2, v6 +; FLAT-NEXT: v_and_b32_e32 v5, s2, v7 +; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v6, s0, v6 +; FLAT-NEXT: v_and_b32_e32 v7, s0, v7 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5] ; FLAT-NEXT: v_lshrrev_b64 v[6:7], 4, v[6:7] ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 ; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 +; FLAT-NEXT: s_mov_b32 s10, 0x55555555 +; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: s_mov_b32 s11, 0xaaaaaaaa ; FLAT-NEXT: v_and_b32_e32 v0, s10, v2 ; FLAT-NEXT: v_and_b32_e32 v1, s10, v3 -; FLAT-NEXT: v_and_b32_e32 v4, s3, v6 -; FLAT-NEXT: v_and_b32_e32 v5, s3, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s1, v6 +; FLAT-NEXT: v_and_b32_e32 v5, s1, v7 ; FLAT-NEXT: v_and_b32_e32 v2, s11, v2 ; FLAT-NEXT: v_and_b32_e32 v3, s11, v3 -; FLAT-NEXT: v_and_b32_e32 v6, s9, v6 -; FLAT-NEXT: v_and_b32_e32 v7, s9, v7 +; FLAT-NEXT: v_and_b32_e32 v6, s3, v6 +; FLAT-NEXT: v_and_b32_e32 v7, s3, v7 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] @@ -581,6 +605,8 @@ ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: v_or_b32_e32 v0, v6, v4 ; FLAT-NEXT: v_or_b32_e32 v1, v7, v5 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 @@ -675,33 +701,35 @@ ; FLAT-LABEL: v_brev_v2i64: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; FLAT-NEXT: s_mov_b32 s2, 0x10203 -; FLAT-NEXT: s_mov_b32 s3, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s0, 0x10203 +; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f ; FLAT-NEXT: s_mov_b32 s8, 0xf0f0f0f0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0x33333333 -; FLAT-NEXT: s_mov_b32 s1, 0xcccccccc +; FLAT-NEXT: s_mov_b32 s2, 0x33333333 +; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc ; FLAT-NEXT: s_mov_b32 s9, 0x55555555 ; FLAT-NEXT: s_mov_b32 s10, 0xaaaaaaaa ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v6, 0, v0, s2 -; FLAT-NEXT: v_perm_b32 v4, 0, v3, s2 -; FLAT-NEXT: v_perm_b32 v2, 0, v2, s2 -; FLAT-NEXT: v_perm_b32 v8, 0, v1, s2 -; FLAT-NEXT: v_and_b32_e32 v1, s3, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s3, v4 +; FLAT-NEXT: v_perm_b32 v6, 0, v0, s0 +; FLAT-NEXT: v_perm_b32 v4, 0, v3, s0 +; FLAT-NEXT: v_perm_b32 v2, 0, v2, s0 +; FLAT-NEXT: v_perm_b32 v8, 0, v1, s0 +; FLAT-NEXT: v_and_b32_e32 v1, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v0, s1, v4 ; FLAT-NEXT: v_and_b32_e32 v3, s8, v2 ; FLAT-NEXT: v_and_b32_e32 v2, s8, v4 -; FLAT-NEXT: v_and_b32_e32 v5, s3, v6 -; FLAT-NEXT: v_and_b32_e32 v4, s3, v8 +; FLAT-NEXT: v_and_b32_e32 v5, s1, v6 +; FLAT-NEXT: v_and_b32_e32 v4, s1, v8 ; FLAT-NEXT: v_and_b32_e32 v7, s8, v6 ; FLAT-NEXT: v_and_b32_e32 v6, s8, v8 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] @@ -712,14 +740,14 @@ ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 ; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 -; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s0, v2 -; FLAT-NEXT: v_and_b32_e32 v5, s0, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s0, v6 -; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s1, v2 -; FLAT-NEXT: v_and_b32_e32 v7, s1, v7 -; FLAT-NEXT: v_and_b32_e32 v6, s1, v6 +; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v5, s2, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s2, v6 +; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 +; FLAT-NEXT: v_and_b32_e32 v7, s3, v7 +; FLAT-NEXT: v_and_b32_e32 v6, s3, v6 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] @@ -744,6 +772,8 @@ ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v1, v7, v5 ; FLAT-NEXT: v_or_b32_e32 v0, v6, v4 +; FLAT-NEXT: s_nop 0 +; FLAT-NEXT: s_nop 0 ; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir b/llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir --- a/llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir +++ b/llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir @@ -1,5 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s -# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s +# RUN: llc -march=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s --- # Trivial clause at beginning of program diff --git a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir --- a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir +++ b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -2,7 +2,7 @@ # Make sure the default assumption is xnack enabled with no cpu # RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+volcanic-islands -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s -# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s +# RUN: llc -march=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s --- # Trivial clause at beginning of program name: trivial_clause_load_flat4_x1 diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -43,6 +43,8 @@ ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_perm_b32 v0, 0, s4, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %val = load i32, i32 addrspace(1)* %in, align 4 @@ -83,6 +85,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_perm_b32 v1, 0, s5, v0 ; VI-NEXT: v_perm_b32 v0, 0, s4, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 @@ -131,6 +135,8 @@ ; VI-NEXT: v_perm_b32 v2, 0, s6, v0 ; VI-NEXT: v_perm_b32 v1, 0, s5, v0 ; VI-NEXT: v_perm_b32 v0, 0, s4, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 @@ -196,6 +202,8 @@ ; VI-NEXT: v_perm_b32 v6, 0, s6, v4 ; VI-NEXT: v_perm_b32 v5, 0, s5, v4 ; VI-NEXT: v_perm_b32 v4, 0, s4, v4 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_endpgm @@ -237,6 +245,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_perm_b32 v1, 0, s4, v0 ; VI-NEXT: v_perm_b32 v0, 0, s5, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %val = load i64, i64 addrspace(1)* %in, align 8 @@ -285,6 +295,8 @@ ; VI-NEXT: v_perm_b32 v2, 0, s7, v0 ; VI-NEXT: v_perm_b32 v1, 0, s4, v0 ; VI-NEXT: v_perm_b32 v0, 0, s5, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 @@ -350,6 +362,8 @@ ; VI-NEXT: v_perm_b32 v6, 0, s7, v4 ; VI-NEXT: v_perm_b32 v5, 0, s4, v4 ; VI-NEXT: v_perm_b32 v4, 0, s5, v4 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-xnack -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s %struct.ByValStruct = type { [4 x i32] } diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global,-xnack -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s ; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global,-xnack -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global,-xnack -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s declare hidden void @external_void_func_i1(i1) #0 declare hidden void @external_void_func_i1_signext(i1 signext) #0 diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3,-xnack -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3,-xnack -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3,-xnack -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s ; Make sure to run a GPU with the SGPR allocation bug. diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -35,12 +35,14 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: global_store_dword v[0:1], v2, off +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm store i32 0, i32 addrspace(1)* %ptr @@ -65,6 +67,8 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dword v[0:1], v32, off ; GCN-NEXT: s_endpgm call void @func(i32 0) @@ -87,6 +91,8 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dword v[1:2], v0, off ; GCN-NEXT: s_endpgm %rv = call i32 @func.return(i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}callee_no_stack: ; GCN: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -6,7 +6,9 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s33 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -27,7 +29,9 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], s33 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -46,6 +50,8 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 @@ -107,10 +113,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_ushort v0, v[0:1], off ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -131,7 +141,9 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v0, v[0:1], off +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -151,10 +163,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_load_ushort v0, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_load_short_d16_hi v0, v[1:2] ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -175,7 +191,9 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_ushort v0, v[0:1] +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -199,26 +217,40 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: global_load_ushort v4, v[2:3], off ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: global_load_ushort v4, v[2:3], off +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:4 ; GCN-NEXT: global_load_ushort v4, v[2:3], off offset:2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:6 ; GCN-NEXT: global_load_ushort v2, v[2:3], off offset:4 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:8 ; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], s9 offset:4 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], s9 offset:6 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, v4 -; GCN-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], s9 offset:8 ; GCN-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], s9 offset:8 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GCN-NEXT: s_endpgm entry: @@ -301,6 +333,8 @@ ; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s33 offen offset:2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 @@ -321,6 +355,8 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -343,6 +379,8 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir --- a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir +++ b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=tonga -run-pass post-RA-sched -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=tonga -mattr=-xnack -run-pass post-RA-sched -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # GCN: FLAT_LOAD_DWORD # GCN-NEXT: FLAT_LOAD_DWORD diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -15,7 +15,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 12 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel @@ -33,7 +33,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 12 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll --- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll @@ -35,6 +35,8 @@ ; VI-NEXT: s_mov_b32 s5, s0 ; VI-NEXT: s_mov_b32 s6, s0 ; VI-NEXT: s_mov_b32 s7, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm ; VI-NEXT: v_and_b32_e32 v0, 7, v0 ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICI,SI %s ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-xnack < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s ; GCN-LABEL: {{^}}load_i32: ; GCN-DAG: s_mov_b32 s3, 0 diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll --- a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}fold_mi_v_and_0: ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -amdgpu-sdwa-peephole=0 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -28,6 +28,8 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v32, s34, 2 @@ -44,6 +46,7 @@ ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s34, v32, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -62,6 +65,8 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v32, s34, 2 @@ -78,6 +83,7 @@ ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s34, v32, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -96,6 +102,8 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v32, s34, 2 @@ -112,6 +120,7 @@ ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s34, v32, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -130,6 +139,8 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v32, s34, 2 @@ -147,6 +158,7 @@ ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s34, v32, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -190,6 +202,8 @@ ; GCN-NEXT: s_addc_u32 s5, s5, func_v3i16@rel32@hi+4 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: BB4_3: ; %if.end +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_short v[0:1], v1, off ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_endpgm @@ -234,6 +248,8 @@ ; GCN-NEXT: s_addc_u32 s5, s5, func_v3f16@rel32@hi+4 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: BB5_3: ; %if.end +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_short v[0:1], v1, off ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -35,14 +35,16 @@ ; VI-LABEL: s_ctlz_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s1, s0 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0 +; VI-NEXT: s_flbit_i32_b32 s0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -85,19 +87,23 @@ ; VI-LABEL: v_ctlz_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -154,14 +160,16 @@ ; VI-LABEL: v_ctlz_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v1 @@ -170,6 +178,8 @@ ; VI-NEXT: v_ffbh_u32_e32 v3, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -235,14 +245,16 @@ ; VI-LABEL: v_ctlz_v4i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v4, v3 @@ -257,6 +269,8 @@ ; VI-NEXT: v_ffbh_u32_e32 v7, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -319,19 +333,23 @@ ; VI-LABEL: v_ctlz_i8: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 ; VI-NEXT: v_add_u16_e32 v0, -8, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -395,21 +413,23 @@ ; VI-LABEL: s_ctlz_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s0 -; VI-NEXT: s_flbit_i32_b32 s3, s1 -; VI-NEXT: s_add_i32 s2, s2, 32 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_flbit_i32_b32 s0, s2 +; VI-NEXT: s_flbit_i32_b32 s1, s3 +; VI-NEXT: s_add_i32 s0, s0, 32 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; VI-NEXT: s_or_b32 s0, s2, s3 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -459,20 +479,22 @@ ; VI-LABEL: s_ctlz_i64_trunc: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s0 -; VI-NEXT: s_flbit_i32_b32 s3, s1 -; VI-NEXT: s_add_i32 s2, s2, 32 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_flbit_i32_b32 s0, s2 +; VI-NEXT: s_flbit_i32_b32 s1, s3 +; VI-NEXT: s_add_i32 s0, s0, 32 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; VI-NEXT: s_or_b32 s0, s2, s3 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -526,18 +548,20 @@ ; VI-LABEL: v_ctlz_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v6, s3 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v2 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 @@ -547,6 +571,8 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm ; @@ -613,18 +639,20 @@ ; VI-LABEL: v_ctlz_i64_trunc: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v1 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v3, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v4, v0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v4 @@ -634,6 +662,8 @@ ; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; @@ -694,17 +724,21 @@ ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -760,17 +794,21 @@ ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -831,14 +869,16 @@ ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 @@ -846,6 +886,8 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -907,14 +949,16 @@ ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 @@ -922,6 +966,8 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -978,16 +1024,20 @@ ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -1047,13 +1097,15 @@ ; VI-LABEL: v_ctlz_i16_sel_eq_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 @@ -1061,6 +1113,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff ; VI-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -1120,17 +1174,21 @@ ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s declare i16 @llvm.ctpop.i16(i16) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -25,16 +25,20 @@ ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -67,18 +71,22 @@ ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -113,19 +121,23 @@ ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -160,20 +172,24 @@ ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -220,13 +236,13 @@ ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -234,18 +250,22 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ubyte v8, v[2:3] -; VI-NEXT: flat_load_ubyte v2, v[4:5] -; VI-NEXT: flat_load_ubyte v3, v[6:7] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_ubyte v9, v[4:5] +; VI-NEXT: flat_load_ubyte v10, v[6:7] +; VI-NEXT: flat_load_ubyte v11, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v8 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v9 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v10 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -304,37 +324,44 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_mov_b32_e32 v4, 9 -; VI-NEXT: s_movk_i32 s8, 0x900 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v4, 9 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_movk_i32 s0, 0x900 +; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v6, s8 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v5 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v5 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v5 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v5 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: v_and_b32_e32 v8, 0xffffff00, v5 ; VI-NEXT: v_add_u16_e32 v9, 9, v5 ; VI-NEXT: v_add_u16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v7 ; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, s8, v0 +; VI-NEXT: v_add_u16_e32 v0, s0, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -390,18 +417,20 @@ ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ubyte v10, v[2:3] ; VI-NEXT: flat_load_ubyte v11, v[4:5] ; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0 @@ -412,24 +441,31 @@ ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v8, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v5, v[8:9] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_ubyte v16, v[2:3] +; VI-NEXT: flat_load_ubyte v12, v[4:5] +; VI-NEXT: flat_load_ubyte v13, v[6:7] +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_ubyte v14, v[8:9] +; VI-NEXT: flat_load_ubyte v15, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v10 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) +; VI-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 ; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v2 -; VI-NEXT: v_or_b32_sdwa v2, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v16 +; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v12 +; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v13 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v4 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v3 -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v14 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v15 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -470,14 +506,16 @@ ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 @@ -488,6 +526,8 @@ ; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -521,18 +561,22 @@ ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -566,17 +610,21 @@ ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -611,16 +659,20 @@ ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -664,13 +716,13 @@ ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -678,19 +730,23 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_ubyte v8, v[2:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_ubyte v9, v[4:5] +; VI-NEXT: flat_load_ubyte v10, v[6:7] +; VI-NEXT: flat_load_ubyte v11, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v4 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v11 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v10 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -723,17 +779,21 @@ ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -766,17 +826,21 @@ ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -810,17 +874,21 @@ ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -854,17 +922,21 @@ ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -908,13 +980,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; VI-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll --- a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll @@ -8,6 +8,8 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_store_dword v[0:1], v0 ; GCN-NEXT: s_endpgm %c1 = fcmp olt float %x, 1.0 @@ -25,6 +27,8 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 4.0, 2.0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_store_dword v[0:1], v0 ; GCN-NEXT: s_endpgm %c1 = fcmp olt float %x, 1.0 @@ -42,6 +46,8 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 4.0, 2.0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_store_dword v[0:1], v0 ; GCN-NEXT: s_endpgm %c1 = fcmp olt float %x, 1.0 @@ -59,6 +65,8 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_store_dword v[0:1], v0 ; GCN-NEXT: s_endpgm %c1 = fcmp olt float %x, 1.0 @@ -73,6 +81,8 @@ ; GCN-LABEL: different_constants: ; GCN: ; %bb.0: ; GCN-NEXT: v_mov_b32_e32 v0, 2.0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_store_dword v[0:1], v0 ; GCN-NEXT: s_endpgm %c1 = fcmp olt float %x, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll --- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll +++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll @@ -27,18 +27,25 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX810 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX810 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+code-object-v3,+xnack < %s | FileCheck --check-prefixes=GFX900 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX902 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX904 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX906 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx909 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX909 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX1010 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1011 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX1011 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1012 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX1012 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+code-object-v3,+xnack < %s | FileCheck --check-prefixes=XNACK-GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=+code-object-v3,-xnack < %s | FileCheck --check-prefixes=NO-XNACK-GFX803 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+code-object-v3,-xnack < %s | FileCheck --check-prefixes=NO-XNACK-GFX900 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=+code-object-v3,-xnack < %s | FileCheck --check-prefixes=NO-XNACK-GFX902 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,-xnack < %s | FileCheck --check-prefixes=NO-XNACK-GFX1010 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+code-object-v3,+sram-ecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX904 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+code-object-v3,+sram-ecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s - -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+code-object-v3,+sram-ecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX904 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+code-object-v3,+sram-ecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+code-object-v3,-sram-ecc < %s | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+code-object-v3,-sram-ecc < %s | FileCheck --check-prefixes=NO-SRAM-ECC-GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+code-object-v3,-sram-ecc,-xnack < %s | FileCheck --check-prefixes=NO-XNACK-NO-SRAM-ECC-GFX906 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+code-object-v3,-sram-ecc,-xnack < %s | FileCheck --check-prefixes=NO-XNACK-NO-SRAM-ECC-GFX908 %s ; GFX600: .amdgcn_target "amdgcn-amd-amdhsa--gfx600" ; GFX601: .amdgcn_target "amdgcn-amd-amdhsa--gfx601" @@ -48,22 +55,28 @@ ; GFX703: .amdgcn_target "amdgcn-amd-amdhsa--gfx703" ; GFX704: .amdgcn_target "amdgcn-amd-amdhsa--gfx704" ; GFX801: .amdgcn_target "amdgcn-amd-amdhsa--gfx801+xnack" -; GFX802: .amdgcn_target "amdgcn-amd-amdhsa--gfx802" -; GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803" +; GFX802: .amdgcn_target "amdgcn-amd-amdhsa--gfx802+xnack" +; GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803+xnack" ; GFX810: .amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack" -; GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" +; GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack" ; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902+xnack" -; GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904" -; GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906" +; GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack" +; GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sram-ecc" +; GFX908: .amdgcn_target "amdgcn-amd-amdhsa--gfx908+xnack+sram-ecc" +; GFX909: .amdgcn_target "amdgcn-amd-amdhsa--gfx909+xnack" +; GFX1010: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010+xnack" +; GFX1011: .amdgcn_target "amdgcn-amd-amdhsa--gfx1011+xnack" +; GFX1012: .amdgcn_target "amdgcn-amd-amdhsa--gfx1012+xnack" -; XNACK-GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack" +; NO-XNACK-GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803" +; NO-XNACK-GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" ; NO-XNACK-GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902" +; NO-XNACK-GFX1010: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010" -; SRAM-ECC-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+sram-ecc" -; SRAM-ECC-GFX906: "amdgcn-amd-amdhsa--gfx906+sram-ecc" - -; SRAM-ECC-XNACK-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack+sram-ecc" -; SRAM-ECC-XNACK-GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sram-ecc" +; NO-SRAM-ECC-GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack" +; NO-SRAM-ECC-GFX908: .amdgcn_target "amdgcn-amd-amdhsa--gfx908+xnack" +; NO-XNACK-NO-SRAM-ECC-GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906" +; NO-XNACK-NO-SRAM-ECC-GFX908: .amdgcn_target "amdgcn-amd-amdhsa--gfx908" define amdgpu_kernel void @directive_amdgcn_target() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll @@ -1,44 +1,36 @@ -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=-sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=+sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX902 %s - -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=-sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc,+xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=-sram-ecc,-xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-NO-XNACK-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 < %s | llvm-readobj -file-headers - | FileCheck --check-prefix=SRAM-ECC-GFX908 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 < %s | llvm-readobj -file-headers - | FileCheck --check-prefix=SRAM-ECC-XNACK-GFX908 %s -; NO-SRAM-ECC-GFX902: Flags [ -; NO-SRAM-ECC-GFX902-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) -; NO-SRAM-ECC-GFX902-NEXT: EF_AMDGPU_XNACK (0x100) -; NO-SRAM-ECC-GFX902-NEXT: ] +; RUN: not --crash llc -march=amdgcn -mcpu=gfx900 -mattr=+sram-ecc < %s 2>&1 | FileCheck --check-prefixes=ERR-SRAM-ECC-GFX900 %s -; SRAM-ECC-GFX902: Flags [ -; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) -; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_SRAM_ECC (0x200) -; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_XNACK (0x100) -; SRAM-ECC-GFX902-NEXT: ] +; SRAM-ECC-XNACK-GFX906: Flags [ +; SRAM-ECC-XNACK-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) +; SRAM-ECC-XNACK-GFX906-NEXT: EF_AMDGPU_SRAM_ECC (0x200) +; SRAM-ECC-XNACK-GFX906-NEXT: EF_AMDGPU_XNACK (0x100) +; SRAM-ECC-XNACK-GFX906-NEXT: ] ; NO-SRAM-ECC-GFX906: Flags [ ; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) +; NO-SRAM-ECC-GFX906-NOT: EF_AMDGPU_SRAM_ECC (0x200) +; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_XNACK (0x100) ; NO-SRAM-ECC-GFX906-NEXT: ] -; SRAM-ECC-GFX906: Flags [ -; SRAM-ECC-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) -; SRAM-ECC-GFX906-NEXT: EF_AMDGPU_SRAM_ECC (0x200) -; SRAM-ECC-GFX906-NEXT: ] +; NO-SRAM-ECC-NO-XNACK-GFX906: Flags [ +; NO-SRAM-ECC-NO-XNACK-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) +; NO-SRAM-ECC-NO-XNACK-GFX906-NOT: EF_AMDGPU_SRAM_ECC (0x200) +; NO-SRAM-ECC-NO-XNACK-GFX906-NOT: EF_AMDGPU_XNACK (0x100) +; NO-SRAM-ECC-NO-XNACK-GFX906-NEXT: ] -; SRAM-ECC-XNACK-GFX906: Flags [ -; SRAM-ECC-XNACK-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) -; SRAM-ECC-XNACK-GFX906-NEXT: EF_AMDGPU_SRAM_ECC (0x200) -; SRAM-ECC-XNACK-GFX906-NEXT: EF_AMDGPU_XNACK (0x100) -; SRAM-ECC-XNACK-GFX906-NEXT: ] +; SRAM-ECC-XNACK-GFX908: Flags [ +; SRAM-ECC-XNACK-GFX908-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX908 (0x30) +; SRAM-ECC-XNACK-GFX908-NEXT: EF_AMDGPU_SRAM_ECC (0x200) +; SRAM-ECC-XNACK-GFX908-NEXT: EF_AMDGPU_XNACK (0x100) +; SRAM-ECC-XNACK-GFX908-NEXT: ] -; SRAM-ECC-GFX908: Flags [ (0x230) -; SRAM-ECC-GFX908: EF_AMDGPU_MACH_AMDGCN_GFX908 (0x30) -; SRAM-ECC-GFX908: EF_AMDGPU_SRAM_ECC (0x200) -; SRAM-ECC-GFX908: ] +; ERR-SRAM-ECC-GFX900: LLVM ERROR: SRAM ECC was requested for a processor that does not support it! define amdgpu_kernel void @elf_header() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-xnack.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-xnack.ll --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-xnack.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-xnack.ll @@ -1,5 +1,6 @@ ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx801 -mattr=-xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-XNACK-GFX801 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx802 -mattr=+xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=XNACK-GFX802 %s +; RUN: not --crash llc -filetype=obj -march=amdgcn -mcpu=gfx600 -mattr=+xnack < %s 2>&1 | FileCheck --check-prefixes=ERR-XNACK-GFX600 %s ; NO-XNACK-GFX801: Flags [ ; NO-XNACK-GFX801-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX801 (0x28) @@ -10,6 +11,8 @@ ; XNACK-GFX802-NEXT: EF_AMDGPU_XNACK (0x100) ; XNACK-GFX802-NEXT: ] +; ERR-XNACK-GFX600: LLVM ERROR: XNACK was requested for a processor that does not support it! + define amdgpu_kernel void @elf_header() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll --- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll @@ -11,14 +11,14 @@ ; OSABI-UNK-NOT: .hsa_code_object_version ; OSABI-UNK-NOT: .hsa_code_object_isa -; OSABI-UNK: .amd_amdgpu_isa "amdgcn-amd-unknown--gfx802" +; OSABI-UNK: .amd_amdgpu_isa "amdgcn-amd-unknown--gfx802+xnack" ; OSABI-UNK-NOT: .amd_amdgpu_hsa_metadata ; OSABI-UNK-NOT: .amd_amdgpu_pal_metadata ; OSABI-UNK-ELF-NOT: Unknown note type ; OSABI-UNK-ELF: NT_AMD_AMDGPU_ISA (ISA Version) ; OSABI-UNK-ELF: ISA Version: -; OSABI-UNK-ELF: amdgcn-amd-unknown--gfx802 +; OSABI-UNK-ELF: amdgcn-amd-unknown--gfx802+xnack ; OSABI-UNK-ELF-NOT: Unknown note type ; OSABI-UNK-ELF-NOT: NT_AMD_AMDGPU_HSA_METADATA (HSA Metadata) ; OSABI-UNK-ELF-NOT: Unknown note type @@ -27,7 +27,7 @@ ; OSABI-HSA: .hsa_code_object_version ; OSABI-HSA: .hsa_code_object_isa -; OSABI-HSA: .amd_amdgpu_isa "amdgcn-amd-amdhsa--gfx802" +; OSABI-HSA: .amd_amdgpu_isa "amdgcn-amd-amdhsa--gfx802+xnack" ; OSABI-HSA: .amd_amdgpu_hsa_metadata ; OSABI-HSA-NOT: .amd_amdgpu_pal_metadata @@ -35,7 +35,7 @@ ; OSABI-HSA-ELF: Unknown note type (0x00000003) ; OSABI-HSA-ELF: NT_AMD_AMDGPU_ISA (ISA Version) ; OSABI-HSA-ELF: ISA Version: -; OSABI-HSA-ELF: amdgcn-amd-amdhsa--gfx802 +; OSABI-HSA-ELF: amdgcn-amd-amdhsa--gfx802+xnack ; OSABI-HSA-ELF: NT_AMD_AMDGPU_HSA_METADATA (HSA Metadata) ; OSABI-HSA-ELF: HSA Metadata: ; OSABI-HSA-ELF: --- @@ -55,14 +55,14 @@ ; OSABI-PAL-NOT: .hsa_code_object_version ; OSABI-PAL: .hsa_code_object_isa -; OSABI-PAL: .amd_amdgpu_isa "amdgcn-amd-amdpal--gfx802" +; OSABI-PAL: .amd_amdgpu_isa "amdgcn-amd-amdpal--gfx802+xnack" ; OSABI-PAL-NOT: .amd_amdgpu_hsa_metadata ; OSABI-PAL: .amd_amdgpu_pal_metadata ; OSABI-PAL-ELF: Unknown note type (0x00000003) ; OSABI-PAL-ELF: NT_AMD_AMDGPU_ISA (ISA Version) ; OSABI-PAL-ELF: ISA Version: -; OSABI-PAL-ELF: amdgcn-amd-amdpal--gfx802 +; OSABI-PAL-ELF: amdgcn-amd-amdpal--gfx802+xnack ; OSABI-PAL-ELF-NOT: NT_AMD_AMDGPU_HSA_METADATA (HSA Metadata) ; OSABI-PAL-ELF: NT_AMD_AMDGPU_PAL_METADATA (PAL Metadata) ; TODO: readobj can no longer dump PAL metadata pending resolution of D52821 diff --git a/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll b/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll --- a/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll @@ -98,5 +98,5 @@ attributes #0 = { nounwind "target-cpu"="tahiti" } attributes #1 = { nounwind "target-cpu"="bonaire" } -attributes #2 = { nounwind "target-cpu"="iceland" } -attributes #3 = { nounwind "target-cpu"="fiji" } +attributes #2 = { nounwind "target-cpu"="iceland" "target-features"="-xnack"} +attributes #3 = { nounwind "target-cpu"="fiji" "target-features"="-xnack"} diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; How the replacement of i64 stores with v2i32 stores resulted in ; breaking other users of the bitcast if they already existed diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -33,9 +33,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 2 @@ -89,6 +89,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: global_store_short v[0:1], v3, off offset:2 ; GFX9-NEXT: s_endpgm @@ -196,6 +198,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1 @@ -286,6 +290,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -33,9 +33,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], s33 offen -; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen offset:2 +; GFX9-NEXT: buffer_load_ushort v2, v0, s[0:3], s33 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 2 @@ -75,8 +75,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], s33 offen ; GFX9-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], s33 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -167,6 +171,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -231,13 +237,31 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GCN-LABEL: private_store_2xi16_align4: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, 0x20001 -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX7-ALIGNED-LABEL: private_store_2xi16_align4: +; GFX7-ALIGNED: ; %bb.0: +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX7-ALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-UNALIGNED-LABEL: private_store_2xi16_align4: +; GFX7-UNALIGNED: ; %bb.0: +; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: private_store_2xi16_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 4 store i16 2, i16 addrspace(5)* %gep.r, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -34,13 +34,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -86,13 +90,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -138,13 +146,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -190,13 +202,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -32,13 +32,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -84,13 +88,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -136,13 +144,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ngt_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -188,13 +200,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -240,13 +256,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -292,13 +312,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -344,13 +368,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_le_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -396,13 +424,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -mcpu=hawaii -start-after=sink -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=fiji -start-after=sink --verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=fiji -start-after=sink -mattr=-xnack -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -start-after=sink -mattr=-xnack -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=VI -check-prefix=FUNC %s ; -------------------------------------------------------------------------------- ; fadd tests diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -1,5 +1,5 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -341,7 +341,7 @@ ; FIXME: Should not scalarize ; GCN-LABEL: {{^}}v5i16_func_void: ; GFX9: buffer_load_dwordx2 v[0:1] -; GFX9-NEXT: global_load_short_d16 v2 +; GFX9: global_load_short_d16 v2 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 define <5 x i16> @v5i16_func_void() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr.ll b/llvm/test/CodeGen/AMDGPU/global-saddr.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr.ll @@ -86,7 +86,7 @@ ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16{{$}} -; GFX9-NEXT: s_waitcnt +; GFX9: s_waitcnt ; NGFX9-NOT: global_load_dword define amdgpu_cs void @_amdgpu_cs_main(i64 inreg %arg) { diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll b/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll --- a/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -mattr=-xnack -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: %bb22 diff --git a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir @@ -1,6 +1,6 @@ # RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GCX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s # GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr_hidden_bundle # GCN: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir @@ -1,6 +1,6 @@ # RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s # GCN-LABEL: name: break_smem_clause_max_look_ahead_in_bundle # GCN: S_LOAD_DWORDX2_IMM diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -1,7 +1,7 @@ ; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=WAVE64 --check-prefix=NOTES %s -; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=WAVE64 --check-prefix=NOTES %s -; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=WAVE64 --check-prefix=NOTES %s -; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX1010 --check-prefix=WAVE32 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=WAVE64 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=WAVE64 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX1010 --check-prefix=WAVE32 --check-prefix=NOTES %s @var = addrspace(1) global float 0.0 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3,-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s @var = addrspace(1) global float 0.0 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -44,6 +44,8 @@ ; GFX9-NEXT: s_addc_u32 s5, s5, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB0_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -112,6 +114,8 @@ ; GFX9-NEXT: s_addc_u32 s5, s5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB1_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -176,6 +180,8 @@ ; GFX9-NEXT: s_addc_u32 s5, s5, 0 ; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3 ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB2_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -238,6 +244,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] ; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB3_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -289,6 +297,8 @@ ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 ; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v8, s[0:1] ; GFX9-NEXT: s_and_b64 vcc, exec, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -342,6 +352,8 @@ ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v4 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -397,6 +409,8 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] ; GFX9-NEXT: v_add_u32_e32 v2, v8, v2 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -454,6 +468,8 @@ ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, v7, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -38,82 +38,92 @@ ; GFX8-LABEL: udot2: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -173,95 +183,103 @@ ; GFX8-LABEL: udot2_MulMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, s5, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s9, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, s2, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_MulMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v0, s5, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, s9, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v0, s2, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, s5, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MulMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v0, s5, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_add_u32_e32 v2, s9, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v0, s2, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_add_u32_e32 v2, s5, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MulMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s2, s6 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s3, v0 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s5, s4 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s9, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -315,80 +333,92 @@ ; GFX8-LABEL: idot2: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s6, s3 -; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_ashr_i32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s1, s0, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -443,91 +473,103 @@ ; GFX8-LABEL: idot2_MixedTypedMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s6, s3 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_mad_u32_u24 v0, s3, v1, v0 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MixedTypedMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedTypedMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedTypedMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 +; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -584,82 +626,92 @@ ; GFX8-LABEL: udot2_alt_AddOperands: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_alt_AddOperands: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_alt_AddOperands: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -714,91 +766,103 @@ ; GFX8-LABEL: idot2_MixedExt: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s6, s3, 0xffff -; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, 0xffff +; GFX8-NEXT: s_ashr_i32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MixedExt: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s6, s3, 0xffff -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedExt: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s6, s3, 0xffff -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedExt: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_ashr_i32 s5, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s6, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-DL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 +; GFX10-DL-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -849,76 +913,88 @@ ; GFX8-LABEL: notudot2_SameVec: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: s_and_b32 s1, s4, 0xffff -; GFX8-NEXT: v_mad_u32_u24 v2, s0, s0, v2 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: v_mad_u32_u24 v2, s1, s1, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, s0, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notudot2_SameVec: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: s_and_b32 s1, s4, 0xffff -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_SameVec: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 0xffff -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_SameVec: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, s4 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s1, s8 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -975,82 +1051,92 @@ ; GFX8-LABEL: udot2_v4i16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_v4i16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1107,82 +1193,92 @@ ; GFX8-LABEL: udot2_v4i16_Hi: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x4 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x4 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_v4i16_Hi: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x4 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x4 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16_Hi: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x4 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1239,95 +1335,103 @@ ; GFX8-LABEL: notudot2_v4i16_Even: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s8, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s10, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s3, s8 -; GFX8-NEXT: s_and_b32 s2, s2, s8 -; GFX8-NEXT: s_and_b32 s5, s5, s8 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_and_b32 s0, s0, s10 +; GFX8-NEXT: s_and_b32 s5, s9, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX8-NEXT: s_and_b32 s4, s4, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: s_and_b32 s4, s8, s10 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notudot2_v4i16_Even: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NODL-NEXT: s_and_b32 s2, s2, s8 -; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, s10 +; GFX9-NODL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-NODL-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Even: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-DL-NEXT: s_and_b32 s2, s2, s8 -; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-DL-NEXT: s_and_b32 s0, s0, s10 +; GFX9-DL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-DL-NEXT: s_and_b32 s4, s4, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: s_and_b32 s4, s8, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Even: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s10, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX10-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s2, s2, s8 -; GFX10-DL-NEXT: s_and_b32 s4, s4, s8 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s3, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_and_b32 s1, s1, s10 +; GFX10-DL-NEXT: s_and_b32 s4, s9, s10 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s11 +; GFX10-DL-NEXT: s_and_b32 s0, s0, s10 +; GFX10-DL-NEXT: s_and_b32 s5, s8, s10 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s1, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1384,95 +1488,103 @@ ; GFX8-LABEL: notudot2_v4i16_Middle: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s8, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s10, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s3, s8 -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s5, s5, s8 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s9, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: s_lshr_b32 s4, s8, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notudot2_v4i16_Middle: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Middle: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Middle: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s10, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX10-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s3, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_and_b32 s1, s1, s10 +; GFX10-DL-NEXT: s_and_b32 s4, s9, s10 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s11 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s8, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s1, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1529,95 +1641,103 @@ ; GFX8-LABEL: notudot2_DiffIndex: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX8-NEXT: s_lshr_b32 s7, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notudot2_DiffIndex: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_DiffIndex: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_DiffIndex: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-DL-NEXT: s_and_b32 s7, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_and_b32 s2, s3, s2 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-DL-NEXT: s_and_b32 s0, s0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1675,99 +1795,107 @@ ; GFX8-LABEL: udot2_MultipleUses_add1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v1, s2, v1, v0 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v1, s5, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v0 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v0 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v0 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v0 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-DL-NEXT: s_and_b32 s0, s0, s8 +; GFX10-DL-NEXT: s_and_b32 s1, s1, s8 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1825,95 +1953,107 @@ ; GFX8-LABEL: idot2_MultipleUses_add1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s6, s3 -; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX8-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_ashr_i32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v1, s5, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v0 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_ashr_i32 s5, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s6, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s3, s2, v0 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 +; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s1, s0, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1973,99 +2113,107 @@ ; GFX8-LABEL: udot2_MultipleUses_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v2, v0 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v2, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v2, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v2, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s6, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s3, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s6, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s4, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2124,95 +2272,107 @@ ; GFX8-LABEL: idot2_MultipleUses_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_sext_i32_i16 s6, s3 -; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX8-NEXT: v_mad_i32_i24 v0, s3, v2, v0 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_ashr_i32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_ashr_i32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, s1, v2, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v2, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v2, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX10-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s5, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s4, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2273,99 +2433,107 @@ ; GFX8-LABEL: udot2_MultipleUses_mul2: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-DL-NEXT: s_and_b32 s0, s0, s8 +; GFX10-DL-NEXT: s_and_b32 s1, s1, s8 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2424,95 +2592,107 @@ ; GFX8-LABEL: idot2_MultipleUses_mul2: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s6, s3 -; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_ashr_i32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_ashr_i32 s5, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s6, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 +; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2571,65 +2751,72 @@ ; GFX8-LABEL: udot2_acc16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s1, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_lshr_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s2, s0 +; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_and_b32 s3, s2, s1 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_and_b32 s1, s0, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s1 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2643,10 +2830,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2702,112 +2891,129 @@ ; GFX8-LABEL: notsdot2_sext8: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0) -; GFX8-NEXT: v_bfe_i32 v1, v2, 0, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_i32 v3, v0, 0, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v4, v[2:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: flat_load_ushort v5, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v4 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v5 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v2, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX8-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX8-NEXT: v_bfe_i32 v1, v5, 0, 8 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, v1, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notsdot2_sext8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NODL-NEXT: global_load_ushort v4, v[2:3], off +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: global_load_ushort v5, v[0:1], off ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_bfe_i32 v1, v2, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v4 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_bfe_i32 v3, v0, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v5 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v5, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v1, v0, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notsdot2_sext8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-DL-NEXT: global_load_ushort v4, v[2:3], off +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: global_load_ushort v5, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_i32 v1, v2, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_i32 v3, v0, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v3, 8, v5 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v1, v5, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v1, v0, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notsdot2_sext8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-DL-NEXT: global_load_ushort v2, v[2:3], off -; GFX10-DL-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: global_load_ushort v4, v[2:3], off +; GFX10-DL-NEXT: global_load_ushort v5, v[0:1], off +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, v0 -; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, v5 +; GFX10-DL-NEXT: v_bfe_i32 v2, v4, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v3, v5, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v1, v3, v1, s2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, v0, v2, v1 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v1, v0, s0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i8> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -41,96 +41,108 @@ ; GFX8-LABEL: idot4_acc32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s4, s2 -; GFX8-NEXT: s_sext_i32_i8 s5, s3 -; GFX8-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s4, s0 +; GFX8-NEXT: s_sext_i32_i8 s5, s1 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX8-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX8-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_ashr_i32 s3, s3, 24 +; GFX8-NEXT: s_ashr_i32 s1, s1, 24 ; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s0, v2, v3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s3, s4, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -218,79 +230,89 @@ ; GFX8-LABEL: idot4_acc16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_sext_i32_i8 s2, s0 +; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s1, s2 -; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX8-NEXT: s_sext_i32_i8 s1, s0 +; GFX8-NEXT: s_sext_i32_i8 s2, s1 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80008 -; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80010 -; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_i32 s4, s1, 0x80008 +; GFX8-NEXT: s_bfe_i32 s3, s1, 0x80010 ; GFX8-NEXT: s_ashr_i32 s0, s0, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_ashr_i32 s1, s1, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s3, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2 -; GFX9-NODL-NEXT: s_bfe_i32 s3, s2, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0 +; GFX9-NODL-NEXT: s_bfe_i32 s3, s0, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s0, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008 -; GFX9-NODL-NEXT: s_bfe_i32 s3, s0, 0x80010 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: s_bfe_i32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s3, s1, 0x80010 ; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s1, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -304,10 +326,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -385,81 +409,88 @@ ; GFX8-LABEL: idot4_acc8: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010 +; GFX8-NEXT: s_and_b32 s3, s2, s1 +; GFX8-NEXT: s_and_b32 s1, s0, s1 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s1, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s1, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -473,10 +504,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -546,125 +579,137 @@ ; GFX8-LABEL: idot4_multiuse_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s4, s2 -; GFX8-NEXT: s_sext_i32_i8 s5, s3 -; GFX8-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s4, s0 +; GFX8-NEXT: s_sext_i32_i8 s5, s1 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX8-NEXT: v_mad_i32_i24 v1, s4, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX8-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX8-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX8-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_ashr_i32 s3, s3, 24 +; GFX8-NEXT: s_ashr_i32 s1, s1, 24 ; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-DL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_sext_i32_i8 s5, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s6, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80008 -; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x80008 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s6, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s7, v0 -; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s6, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s7, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x80008 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 +; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -744,126 +789,138 @@ ; GFX8-LABEL: idot4_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX8-NEXT: s_ashr_i32 s6, s3, 24 -; GFX8-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX8-NEXT: s_sext_i32_i8 s3, s3 -; GFX8-NEXT: s_ashr_i32 s4, s2, 24 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX8-NEXT: s_sext_i32_i8 s2, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX8-NEXT: s_ashr_i32 s6, s1, 24 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX8-NEXT: s_sext_i32_i8 s1, s1 +; GFX8-NEXT: s_ashr_i32 s4, s0, 24 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x80010 +; GFX8-NEXT: s_sext_i32_i8 s0, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mad_i32_i24 v0, s5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s6, s3, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s4, s2, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s6, s1, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s4, s0, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s0, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX9-DL-NEXT: s_ashr_i32 s6, s3, 24 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-DL-NEXT: s_ashr_i32 s4, s2, 24 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX9-DL-NEXT: s_ashr_i32 s6, s1, 24 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s1, s1 +; GFX9-DL-NEXT: s_ashr_i32 s4, s0, 24 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: s_sext_i32_i8 s5, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s6, s3 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x80010 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -926,46 +983,50 @@ ; GFX8-LABEL: idot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_sext_i32_i8 s2, s0 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s1, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80010 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 8, s0 -; GFX8-NEXT: s_sext_i32_i8 s1, s0 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 8, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_sext_i32_i8 s2, s1 +; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80010 ; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80010 -; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: s_bfe_i32 s4, s1, 0x80010 ; GFX8-NEXT: s_ashr_i32 s0, s0, 24 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: s_ashr_i32 s1, s1, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v4, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, v5, v3, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v6, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v3, 8, s5 ; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s4 @@ -974,38 +1035,42 @@ ; GFX9-NODL-NEXT: v_lshl_or_b32 v3, v3, 16, v5 ; GFX9-NODL-NEXT: v_and_b32_e32 v5, s4, v4 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v0, 8, s2 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v0, 8, s0 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s3, v4 -; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, s1, v4 +; GFX9-NODL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s2, v4 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v4 ; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u32_e32 v4, v3, v4 ; GFX9-NODL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u32_e32 v3, v3, v2 ; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s5 ; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s4 @@ -1014,36 +1079,42 @@ ; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v5 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s4, v4 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s3 -; GFX9-DL-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v0, 8, s2 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s1 +; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v0, 8, s0 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s3, v4 -; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s1, v4 +; GFX9-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v4 ; GFX9-DL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v4, v3, v4 ; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1073,6 +1144,8 @@ ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -42,98 +42,108 @@ ; GFX8-LABEL: udot4_acc32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s5, s3, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX8-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX8-NEXT: v_mad_u32_u24 v0, s4, v0, v1 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX8-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_lshr_b32 s4, s4, 24 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX8-NEXT: s_lshr_b32 s3, s3, 24 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v0, v1 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v2, v3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s3, s4, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -212,81 +222,88 @@ ; GFX8-LABEL: udot4_acc16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: s_and_b32 s3, s2, s1 +; GFX8-NEXT: s_and_b32 s1, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s1, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s1, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -300,10 +317,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -382,81 +401,88 @@ ; GFX8-LABEL: udot4_acc8: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010 +; GFX8-NEXT: s_and_b32 s3, s2, s1 +; GFX8-NEXT: s_and_b32 s1, s0, s1 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s1, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s1, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -470,10 +496,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -537,14 +565,16 @@ ; GFX8-LABEL: udot2_8: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s3, s2, s0 ; GFX8-NEXT: s_and_b32 s0, s1, s0 @@ -555,20 +585,24 @@ ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 ; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 @@ -579,20 +613,24 @@ ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_and_b32 s3, s2, s0 ; GFX9-DL-NEXT: s_and_b32 s0, s1, s0 @@ -603,29 +641,35 @@ ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s8, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s2, s1, s8 +; GFX10-DL-NEXT: s_and_b32 s3, s0, s8 ; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -685,14 +729,16 @@ ; GFX8-LABEL: udot4_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s3, s1, s0 ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 @@ -711,20 +757,24 @@ ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 ; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 @@ -743,23 +793,28 @@ ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s1, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -773,10 +828,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s1, s0, v2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s5, s4, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -848,14 +905,16 @@ ; GFX8-LABEL: udot4_CommutationAccrossMADs: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX8-NEXT: s_and_b32 s3, s1, s0 @@ -874,20 +933,24 @@ ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 @@ -906,20 +969,24 @@ ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX9-DL-NEXT: s_and_b32 s3, s1, s0 @@ -938,35 +1005,41 @@ ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s8, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX10-DL-NEXT: s_and_b32 s5, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1038,129 +1111,137 @@ ; GFX8-LABEL: udot4_multiuse_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s5, s3, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX8-NEXT: v_mad_u32_u24 v1, s5, v0, v1 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX8-NEXT: v_mad_u32_u24 v1, s4, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x80010 ; GFX8-NEXT: v_mad_u32_u24 v1, s6, v2, v1 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX8-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX8-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_lshr_b32 s4, s4, 24 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX8-NEXT: s_lshr_b32 s3, s3, 24 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v0, v1 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v0, v1 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x80010 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s8, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s2, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0 -; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s2, v0 -; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1242,133 +1323,141 @@ ; GFX8-LABEL: udot4_multiuse_add1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s5, s3, s2 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX8-NEXT: s_and_b32 s5, s1, s8 ; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NEXT: v_mad_u32_u24 v0, s6, v0, v1 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s10, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, s5, v2, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s4, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: s_lshr_b32 s4, s4, 24 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mad_u32_u24 v0, s8, v2, v0 -; GFX8-NEXT: s_lshr_b32 s3, s3, 24 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mad_u32_u24 v0, s3, v2, v0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mad_u32_u24 v0, s0, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_multiuse_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, s10, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v2, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v2, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v2, v0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s0, v2, v0 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-DL-NEXT: v_add_u32_e32 v1, s10, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v2, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v2, v0 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v2, v0 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s0, v2, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s8, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_and_b32 s8, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s2, v0 -; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s5, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-DL-NEXT: s_and_b32 s6, s0, s8 +; GFX10-DL-NEXT: s_and_b32 s7, s1, s8 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s9, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s5, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1450,104 +1539,116 @@ ; GFX8-LABEL: notdot4_mixedtypes: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s3, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80008 -; GFX8-NEXT: s_sext_i32_i8 s3, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80010 -; GFX8-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s2, s1, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: s_sext_i32_i8 s4, s0 -; GFX8-NEXT: s_bfe_u32 s3, s0, 0x80010 -; GFX8-NEXT: s_lshr_b32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_sext_i32_i8 s4, s1 +; GFX8-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX8-NEXT: s_lshr_b32 s0, s0, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s3, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notdot4_mixedtypes: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s1, s2, 0x80008 -; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80010 -; GFX9-NODL-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s2, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s3, s0, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s1 +; GFX9-NODL-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notdot4_mixedtypes: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x80008 -; GFX9-DL-NEXT: s_sext_i32_i8 s3, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80010 -; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s3, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: s_bfe_u32 s2, s1, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x80010 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0 -; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: s_sext_i32_i8 s4, s1 +; GFX9-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v5, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1565,6 +1666,8 @@ ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1645,128 +1748,136 @@ ; GFX8-LABEL: udot4_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s5, s3, 24 -; GFX8-NEXT: s_lshr_b32 s6, s4, 24 -; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s3 -; GFX8-NEXT: s_and_b32 s3, s3, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s4 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_lshr_b32 s4, s0, 24 +; GFX8-NEXT: s_lshr_b32 s5, s1, 24 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX8-NEXT: s_and_b32 s0, s0, s8 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX8-NEXT: s_and_b32 s1, s1, s8 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v2, v3 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 24 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s4, 24 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s3 -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s4 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v2, v3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v2, v3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 24 -; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 24 -; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s3 -; GFX9-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s4 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX9-DL-NEXT: s_and_b32 s0, s0, s8 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v2, v3 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s3, 0xff -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s9, 0xff +; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_and_b32 s7, s4, s3 -; GFX10-DL-NEXT: s_and_b32 s3, s5, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-DL-NEXT: v_and_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_and_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s5, s5, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_and_b32 s4, s0, s9 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s9 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-DL-NEXT: v_and_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1831,14 +1942,16 @@ ; GFX8-LABEL: udot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s6, s1, s0 ; GFX8-NEXT: s_and_b32 s0, s2, s0 @@ -1856,90 +1969,102 @@ ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v3, s6, 16, v3 ; GFX9-NODL-NEXT: v_lshl_or_b32 v4, s4, 16, v4 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v4, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 ; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v0, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u32_e32 v4, v2, v4 ; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 24 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-DL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v3, s6, 16, v3 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, s4, 16, v4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v4, v3 -; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-DL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-DL-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 ; GFX9-DL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v0, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v4, v2, v4 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1965,6 +2090,8 @@ ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -2042,13 +2169,15 @@ ; GFX8-LABEL: udot4_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2076,33 +2205,36 @@ ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc8_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s2, 24 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s0, v3 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NODL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s3, s0, 24 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s1, v3 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v5, s2, v5 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v5, s0, v5 ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NODL-NEXT: v_or_b32_e32 v4, v3, v4 @@ -2112,33 +2244,36 @@ ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NODL-NEXT: s_nop 0 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 24 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s0, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-DL-NEXT: s_lshr_b32 s3, s0, 24 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s1, v3 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s2, v5 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s0, v5 ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 @@ -2148,17 +2283,21 @@ ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2185,6 +2324,8 @@ ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -57,128 +57,140 @@ ; GFX8-LABEL: idot8_acc32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s19, s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX8-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s4, s1, 0x40000 +; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX8-NEXT: v_mad_i32_i24 v0, s7, v1, v0 -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s2, v0, v1 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX8-NEXT: v_mad_i32_i24 v0, s9, v1, v0 -; GFX8-NEXT: s_bfe_i32 s11, s2, 0x4000c +; GFX8-NEXT: s_bfe_i32 s12, s1, 0x4000c +; GFX8-NEXT: v_mad_i32_i24 v0, s7, v1, v0 +; GFX8-NEXT: s_bfe_i32 s11, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v1, s12 -; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40010 ; GFX8-NEXT: v_mad_i32_i24 v0, s11, v1, v0 -; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v1, s14 -; GFX8-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s18, s1, 0x40018 ; GFX8-NEXT: v_mad_i32_i24 v0, s13, v1, v0 -; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 -; GFX8-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX8-NEXT: s_bfe_i32 s17, s0, 0x40018 ; GFX8-NEXT: v_mad_i32_i24 v0, s15, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: s_ashr_i32 s4, s4, 28 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 ; GFX8-NEXT: v_mad_i32_i24 v0, s17, v1, v0 -; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s19, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s4, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v0, s7, v1, v0 -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s2, v0, v1 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v0, s9, v1, v0 -; GFX9-NEXT: s_bfe_i32 s11, s2, 0x4000c +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x4000c +; GFX9-NEXT: v_mad_i32_i24 v0, s7, v1, v0 +; GFX9-NEXT: s_bfe_i32 s11, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40010 ; GFX9-NEXT: v_mad_i32_i24 v0, s11, v1, v0 -; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s18, s1, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v0, s13, v1, v0 -; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s17, s0, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v0, s15, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: s_ashr_i32 s4, s4, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: v_mad_i32_i24 v0, s17, v1, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s0, v2, v3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s4, s5, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -319,165 +331,177 @@ ; GFX8-LABEL: idot8_acc16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40004 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX8-NEXT: s_lshr_b32 s1, s0, 12 -; GFX8-NEXT: s_lshr_b32 s7, s2, 12 +; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX8-NEXT: s_lshr_b32 s2, s1, 12 +; GFX8-NEXT: s_lshr_b32 s7, s0, 12 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s1 +; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s2 ; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7 ; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40010 +; GFX8-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s1 -; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40010 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40018 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: s_bfe_i32 s4, s1, 0x40010 +; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40018 ; GFX8-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NEXT: s_bfe_i32 s1, s0, 0x40014 -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40018 -; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s7 +; GFX8-NEXT: s_bfe_i32 s2, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0x40018 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v10, s7 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v2, s6, v3, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v5, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v8, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v9, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v9, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s5, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX9-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-NEXT: s_lshr_b32 s7, s2, 12 +; GFX9-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-NEXT: s_lshr_b32 s2, s1, 12 +; GFX9-NEXT: s_lshr_b32 s7, s0, 12 ; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s1 +; GFX9-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s2 ; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7 ; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40010 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-NEXT: s_bfe_i32 s4, s1, 0x40010 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40018 ; GFX9-NEXT: v_mov_b32_e32 v9, s5 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0x40014 -; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40018 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: s_bfe_i32 s2, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40018 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_i32_i24 v2, s6, v3, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v5, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NEXT: v_mad_u32_u24 v2, v6, v7, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s4, v8, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s1, v9, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s2, v9, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s5, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-DL-NEXT: s_lshr_b32 s7, s2, 12 +; GFX9-DL-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s2, s1, 12 +; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 12 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s1 +; GFX9-DL-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s2 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40018 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s5 -; GFX9-DL-NEXT: s_bfe_i32 s1, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-DL-NEXT: s_bfe_i32 s2, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40018 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v3, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v5, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v6, v7, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v8, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v9, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v9, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -514,6 +538,8 @@ ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -653,174 +679,182 @@ ; GFX8-LABEL: idot8_acc8: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX8-NEXT: s_lshr_b32 s4, s6, 12 -; GFX8-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX8-NEXT: s_bfe_i32 s11, s6, 0x40008 -; GFX8-NEXT: s_lshr_b32 s1, s0, 12 -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40000 +; GFX8-NEXT: s_lshr_b32 s5, s0, 12 +; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40004 +; GFX8-NEXT: s_lshr_b32 s4, s1, 12 +; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40008 +; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40000 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_mov_b32_e32 v7, s9 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX8-NEXT: s_bfe_i32 s13, s6, 0x40010 +; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40010 ; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX8-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX8-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010 +; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40014 +; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v8, s13 -; GFX8-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40014 +; GFX8-NEXT: s_bfe_i32 s17, s0, 0x40018 +; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v9, s15 -; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX8-NEXT: s_ashr_i32 s6, s6, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40018 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v6, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc8: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_movk_i32 s2, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX9-NEXT: s_lshr_b32 s4, s6, 12 -; GFX9-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX9-NEXT: s_bfe_i32 s11, s6, 0x40008 -; GFX9-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40000 +; GFX9-NEXT: s_lshr_b32 s5, s0, 12 +; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40004 +; GFX9-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40008 +; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX9-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX9-NEXT: s_bfe_i32 s13, s6, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40010 ; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 +; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40014 +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s0, 0x40018 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v9, s15 -; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX9-NEXT: s_ashr_i32 s6, s6, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40018 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s6, v6, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s4, s6, 12 -; GFX9-DL-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s11, s6, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 12 +; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX9-DL-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX9-DL-NEXT: s_bfe_i32 s13, s6, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40010 ; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-DL-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s12, s1, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-DL-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s17, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s14, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s6, s6, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-DL-NEXT: s_bfe_i32 s16, s1, 0x40018 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v6, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -857,6 +891,8 @@ ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -982,189 +1018,201 @@ ; GFX8-LABEL: idot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s19, s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s4, s1, 0x40000 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_i32_i24 v1, s5, v0, v1 -; GFX8-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX8-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: v_mad_i32_i24 v0, s7, v2, v0 -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v1, s2, v0, v1 +; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mad_i32_i24 v0, s5, v2, v0 +; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX8-NEXT: v_mad_i32_i24 v0, s9, v2, v0 -; GFX8-NEXT: s_bfe_i32 s11, s2, 0x4000c +; GFX8-NEXT: s_bfe_i32 s12, s1, 0x4000c +; GFX8-NEXT: v_mad_i32_i24 v0, s7, v2, v0 +; GFX8-NEXT: s_bfe_i32 s11, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40010 ; GFX8-NEXT: v_mad_i32_i24 v0, s11, v2, v0 -; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s18, s1, 0x40018 ; GFX8-NEXT: v_mad_i32_i24 v0, s13, v2, v0 -; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 -; GFX8-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX8-NEXT: s_bfe_i32 s17, s0, 0x40018 ; GFX8-NEXT: v_mad_i32_i24 v0, s15, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: s_ashr_i32 s4, s4, 28 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 ; GFX8-NEXT: v_mad_i32_i24 v0, s17, v2, v0 -; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mad_i32_i24 v0, s0, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s19, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s4, s1, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_i32_i24 v1, s5, v0, v1 -; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mad_i32_i24 v0, s7, v2, v0 -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v1, s2, v0, v1 +; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mad_i32_i24 v0, s5, v2, v0 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v0, s9, v2, v0 -; GFX9-NEXT: s_bfe_i32 s11, s2, 0x4000c +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x4000c +; GFX9-NEXT: v_mad_i32_i24 v0, s7, v2, v0 +; GFX9-NEXT: s_bfe_i32 s11, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40010 ; GFX9-NEXT: v_mad_i32_i24 v0, s11, v2, v0 -; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s18, s1, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v0, s13, v2, v0 -; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s17, s0, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v0, s15, v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: s_ashr_i32 s4, s4, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: v_mad_i32_i24 v0, s17, v2, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mad_i32_i24 v0, s0, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s19, s[8:9], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-DL-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s4, s1, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v0, v1 -; GFX9-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s7, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v0, v1 +; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s9, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_i32 s12, s1, 0x4000c +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s7, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s14, s1, 0x40010 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s11, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-DL-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s16, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s18, s1, 0x40018 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s13, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-DL-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s17, s0, 0x40018 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s15, v2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s17, v2, v0 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s0, v2, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40000 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v0 -; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40008 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 -; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x4000c -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 -; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40010 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 -; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 -; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s4, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40004 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v0 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s6, v1 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x4000c +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s6, v1 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s6, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1304,237 +1352,252 @@ ; GFX8-LABEL: idot8_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s5, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s9, s5, 4 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s5, 20 -; GFX8-NEXT: s_lshl_b32 s11, s5, 8 -; GFX8-NEXT: s_lshl_b32 s13, s5, 12 -; GFX8-NEXT: s_lshl_b32 s15, s5, 16 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s5, 24 -; GFX8-NEXT: s_lshl_b32 s5, s5, 28 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 4 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 8 -; GFX8-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 12 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 16 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 20 -; GFX8-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 24 -; GFX8-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 28 -; GFX8-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s5, s1, 4 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s1, 20 +; GFX8-NEXT: s_lshl_b32 s7, s1, 8 +; GFX8-NEXT: s_lshl_b32 s13, s1, 12 +; GFX8-NEXT: s_lshl_b32 s15, s1, 16 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s1, 28 +; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s11, 4 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s11, 8 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s11, 12 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s11, 16 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s11, 20 +; GFX8-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s11, 24 +; GFX8-NEXT: s_ashr_i64 s[34:35], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s11, 28 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[10:11], 60 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 +; GFX8-NEXT: s_load_dword s1, s[8:9], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[4:5], 60 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_i32_i24 v0, s0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s34 ; GFX8-NEXT: v_mad_i32_i24 v0, s20, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s32 ; GFX8-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 ; GFX8-NEXT: v_mov_b32_e32 v1, s30 ; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX8-NEXT: v_mov_b32_e32 v1, s28 ; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX8-NEXT: v_mov_b32_e32 v1, s26 -; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s24 ; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s22 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s5, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s7, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s9, s5, 4 -; GFX9-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s5, 20 -; GFX9-NEXT: s_lshl_b32 s11, s5, 8 -; GFX9-NEXT: s_lshl_b32 s13, s5, 12 -; GFX9-NEXT: s_lshl_b32 s15, s5, 16 -; GFX9-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s5, 24 -; GFX9-NEXT: s_lshl_b32 s5, s5, 28 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 4 -; GFX9-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 8 -; GFX9-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 12 -; GFX9-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 16 -; GFX9-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 20 -; GFX9-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 24 -; GFX9-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 28 -; GFX9-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s5, s1, 4 +; GFX9-NEXT: s_ashr_i64 s[16:17], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s1, 20 +; GFX9-NEXT: s_lshl_b32 s7, s1, 8 +; GFX9-NEXT: s_lshl_b32 s13, s1, 12 +; GFX9-NEXT: s_lshl_b32 s15, s1, 16 +; GFX9-NEXT: s_ashr_i64 s[18:19], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s1, 28 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s11, 4 +; GFX9-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s11, 8 +; GFX9-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s11, 12 +; GFX9-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s11, 16 +; GFX9-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s11, 20 +; GFX9-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s11, 24 +; GFX9-NEXT: s_ashr_i64 s[34:35], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s11, 28 +; GFX9-NEXT: s_ashr_i64 s[22:23], s[10:11], 60 +; GFX9-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 +; GFX9-NEXT: s_load_dword s1, s[8:9], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: s_ashr_i64 s[20:21], s[4:5], 60 +; GFX9-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX9-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mad_i32_i24 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s34 ; GFX9-NEXT: v_mad_i32_i24 v0, s20, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s32 ; GFX9-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 ; GFX9-NEXT: v_mov_b32_e32 v1, s30 ; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX9-NEXT: v_mov_b32_e32 v1, s28 ; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX9-NEXT: v_mov_b32_e32 v1, s26 -; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-NEXT: v_mad_i32_i24 v0, s6, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s24 ; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mad_i32_i24 v2, s4, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s5, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s7, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s1, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s1, 20 +; GFX9-DL-NEXT: s_lshl_b32 s7, s1, 8 +; GFX9-DL-NEXT: s_lshl_b32 s13, s1, 12 +; GFX9-DL-NEXT: s_lshl_b32 s15, s1, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s1, 24 +; GFX9-DL-NEXT: s_lshl_b32 s1, s1, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s11, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s11, 8 +; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s11, 12 +; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s11, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s11, 20 +; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s11, 24 +; GFX9-DL-NEXT: s_ashr_i64 s[34:35], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s11, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[10:11], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 +; GFX9-DL-NEXT: s_load_dword s1, s[8:9], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[4:5], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 20 -; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 8 -; GFX9-DL-NEXT: s_lshl_b32 s13, s5, 12 -; GFX9-DL-NEXT: s_lshl_b32 s15, s5, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 24 -; GFX9-DL-NEXT: s_lshl_b32 s5, s5, 28 -; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 8 -; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 24 -; GFX9-DL-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 28 -; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 -; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s34 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s20, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s32 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s30 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s28 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s26 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s24 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s5, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s7, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s11, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 28 -; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 28 -; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 24 -; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 24 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: s_lshl_b32 s5, s1, 28 +; GFX10-DL-NEXT: s_lshl_b32 s7, s11, 28 +; GFX10-DL-NEXT: s_lshl_b32 s13, s1, 24 +; GFX10-DL-NEXT: s_lshl_b32 s15, s11, 24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 20 -; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 20 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 -; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 16 -; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 16 -; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: s_lshl_b32 s5, s1, 20 +; GFX10-DL-NEXT: s_lshl_b32 s7, s11, 20 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s6, v0 +; GFX10-DL-NEXT: s_lshl_b32 s13, s1, 16 +; GFX10-DL-NEXT: s_lshl_b32 s15, s11, 16 +; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 12 -; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 12 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 -; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 8 -; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 8 -; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: s_lshl_b32 s5, s1, 12 +; GFX10-DL-NEXT: s_lshl_b32 s7, s11, 12 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s6, v0 +; GFX10-DL-NEXT: s_lshl_b32 s13, s1, 8 +; GFX10-DL-NEXT: s_lshl_b32 s15, s11, 8 +; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 -; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 4 -; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 4 +; GFX10-DL-NEXT: s_lshl_b32 s5, s1, 4 +; GFX10-DL-NEXT: s_lshl_b32 s7, s11, 4 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 -; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s6, v0 ; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s6, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s6, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s10, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1635,52 +1698,54 @@ ; GFX8-LABEL: idot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s5, s[4:5], 0x0 +; GFX8-NEXT: s_lshl_b32 s7, s1, 28 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s5, s7, 28 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s9, s7, 24 -; GFX8-NEXT: s_lshl_b32 s11, s7, 20 -; GFX8-NEXT: s_lshl_b32 s5, s1, 28 -; GFX8-NEXT: s_ashr_i64 s[14:15], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s1, 20 -; GFX8-NEXT: s_lshl_b32 s13, s1, 24 +; GFX8-NEXT: s_lshl_b32 s7, s5, 28 +; GFX8-NEXT: s_lshl_b32 s9, s1, 24 +; GFX8-NEXT: s_lshl_b32 s11, s1, 20 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[6:7], 60 +; GFX8-NEXT: s_lshl_b32 s7, s5, 20 +; GFX8-NEXT: s_lshl_b32 s13, s5, 24 ; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 16 +; GFX8-NEXT: s_lshl_b32 s7, s1, 16 ; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: s_lshl_b32 s9, s1, 16 -; GFX8-NEXT: s_lshl_b32 s11, s7, 12 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 16 +; GFX8-NEXT: s_lshl_b32 s11, s1, 12 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[6:7], 60 ; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX8-NEXT: v_mul_i32_i24_e32 v4, s4, v4 -; GFX8-NEXT: s_lshl_b32 s5, s1, 12 -; GFX8-NEXT: s_lshl_b32 s9, s7, 8 +; GFX8-NEXT: v_mul_i32_i24_e32 v4, s6, v4 +; GFX8-NEXT: s_lshl_b32 s7, s5, 12 +; GFX8-NEXT: s_lshl_b32 s9, s1, 8 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX8-NEXT: v_mov_b32_e32 v6, s16 ; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s13, s1, 8 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 4 +; GFX8-NEXT: s_lshl_b32 s13, s5, 8 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[6:7], 60 +; GFX8-NEXT: s_lshl_b32 s7, s1, 4 ; GFX8-NEXT: v_mov_b32_e32 v7, s10 -; GFX8-NEXT: s_lshl_b32 s9, s1, 4 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 4 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[6:7], 60 ; GFX8-NEXT: s_ashr_i64 s[22:23], s[12:13], 60 ; GFX8-NEXT: v_mov_b32_e32 v8, s20 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 ; GFX8-NEXT: s_ashr_i64 s[26:27], s[8:9], 60 ; GFX8-NEXT: v_mov_b32_e32 v9, s24 -; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v5, v2 @@ -1689,30 +1754,34 @@ ; GFX8-NEXT: v_mad_i32_i24 v2, s18, v7, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s22, v8, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s26, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 15 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s12, s2, 28 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s10, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s12 +; GFX9-NEXT: s_and_b32 s1, s0, 15 +; GFX9-NEXT: s_bfe_u32 s2, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s12, s0, 28 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40014 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s10, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s12 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s13, s6, 28 ; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 @@ -1721,34 +1790,36 @@ ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x4000c ; GFX9-NEXT: s_and_b32 s18, s6, 15 ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s18, s6 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s16, s17 -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s14, s15 +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s18, s6 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s16, s17 +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s1 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s14, s15 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v1, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 -; GFX9-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s13 -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s7, s13 +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v6, v4, v6 ; GFX9-NEXT: v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1758,28 +1829,32 @@ ; GFX9-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s4, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s12, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s2 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s12 +; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s11, s0, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s12, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40014 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s12 ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28 ; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 @@ -1788,34 +1863,36 @@ ; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x4000c ; GFX9-DL-NEXT: s_and_b32 s18, s6, 15 ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s18, s6 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s8, s9 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s16, s17 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s14, s15 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s18, s6 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s4, s5 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s16, s17 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s14, s15 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v1, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v0, v4 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6 -; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s13 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s7, s13 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v6, v4, v6 ; GFX9-DL-NEXT: v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1825,17 +1902,21 @@ ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1893,6 +1974,8 @@ ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2014,37 +2097,39 @@ ; GFX8-LABEL: idot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b32 s13, s1, 24 ; GFX8-NEXT: s_lshl_b32 s17, s1, 16 -; GFX8-NEXT: s_ashr_i64 s[22:23], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s25, s5, 24 -; GFX8-NEXT: s_lshl_b32 s27, s5, 28 -; GFX8-NEXT: s_lshl_b32 s29, s5, 16 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s25, s9, 24 +; GFX8-NEXT: s_lshl_b32 s27, s9, 28 +; GFX8-NEXT: s_lshl_b32 s29, s9, 16 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 ; GFX8-NEXT: s_lshl_b32 s15, s1, 28 -; GFX8-NEXT: s_lshl_b32 s19, s5, 8 -; GFX8-NEXT: s_lshl_b32 s21, s5, 12 -; GFX8-NEXT: s_lshl_b32 s23, s5, 4 -; GFX8-NEXT: s_lshl_b32 s5, s5, 20 +; GFX8-NEXT: s_lshl_b32 s19, s9, 8 +; GFX8-NEXT: s_lshl_b32 s21, s9, 12 +; GFX8-NEXT: s_lshl_b32 s23, s9, 4 +; GFX8-NEXT: s_lshl_b32 s9, s9, 20 ; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 ; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60 ; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60 ; GFX8-NEXT: s_ashr_i64 s[28:29], s[28:29], 60 -; GFX8-NEXT: s_lshl_b32 s7, s1, 8 -; GFX8-NEXT: s_lshl_b32 s9, s1, 12 +; GFX8-NEXT: s_lshl_b32 s5, s1, 8 +; GFX8-NEXT: s_lshl_b32 s7, s1, 12 ; GFX8-NEXT: s_lshl_b32 s11, s1, 4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 20 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 ; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 ; GFX8-NEXT: v_mov_b32_e32 v6, s28 ; GFX8-NEXT: v_mov_b32_e32 v7, s16 @@ -2055,10 +2140,10 @@ ; GFX8-NEXT: v_mul_i32_i24_e32 v7, s14, v8 ; GFX8-NEXT: v_mul_i32_i24_sdwa v8, v10, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 ; GFX8-NEXT: v_mul_i32_i24_e32 v5, s0, v5 ; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 ; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 ; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v6, s2, v7 @@ -2068,13 +2153,13 @@ ; GFX8-NEXT: s_ashr_i64 s[32:33], s[22:23], 60 ; GFX8-NEXT: v_mul_i32_i24_sdwa v3, v4, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: v_mov_b32_e32 v12, s18 -; GFX8-NEXT: v_mov_b32_e32 v13, s6 +; GFX8-NEXT: v_mov_b32_e32 v13, s4 ; GFX8-NEXT: s_ashr_i64 s[30:31], s[10:11], 60 ; GFX8-NEXT: v_mov_b32_e32 v11, s32 -; GFX8-NEXT: v_mul_i32_i24_e32 v4, s8, v4 +; GFX8-NEXT: v_mul_i32_i24_e32 v4, s6, v4 ; GFX8-NEXT: v_mul_i32_i24_sdwa v10, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v5 ; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2092,17 +2177,21 @@ ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2180,17 +2269,21 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2268,18 +2361,22 @@ ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2361,6 +2458,8 @@ ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -57,11 +57,13 @@ ; GFX8-LABEL: udot8_acc32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s19, s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 ; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 @@ -71,17 +73,17 @@ ; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX8-NEXT: s_and_b32 s6, s6, 15 -; GFX8-NEXT: s_lshr_b32 s4, s2, 28 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 -; GFX8-NEXT: s_and_b32 s2, s2, 15 +; GFX8-NEXT: s_lshr_b32 s1, s0, 28 +; GFX8-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s11, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 ; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 @@ -89,26 +91,30 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NEXT: v_mad_u32_u24 v0, s10, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s14 -; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s19, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 @@ -118,17 +124,17 @@ ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -136,49 +142,57 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s0, v2, v3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s5, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -302,39 +316,41 @@ ; GFX8-LABEL: udot8_acc16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s2, s0, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_and_b32 s2, s1, 15 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s11, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40018 ; GFX8-NEXT: s_lshr_b32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: s_lshr_b32 s1, s1, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 @@ -342,47 +358,50 @@ ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_and_b32 s2, s0, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_and_b32 s2, s1, 15 ; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s4, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 ; GFX9-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: s_lshr_b32 s1, s1, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 @@ -390,47 +409,50 @@ ; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s10, v8, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: s_and_b32 s2, s1, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s11, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 @@ -438,19 +460,23 @@ ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v8, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -481,6 +507,8 @@ ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -604,39 +632,41 @@ ; GFX8-LABEL: udot8_acc8: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s2, s0, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_and_b32 s2, s1, 15 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s11, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40018 ; GFX8-NEXT: s_lshr_b32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: s_lshr_b32 s1, s1, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 @@ -644,47 +674,50 @@ ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc8: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_and_b32 s2, s0, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_and_b32 s2, s1, 15 ; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s4, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 ; GFX9-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: s_lshr_b32 s1, s1, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 @@ -692,47 +725,50 @@ ; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s10, v8, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: s_and_b32 s2, s1, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s11, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 @@ -740,19 +776,23 @@ ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v8, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -783,6 +823,8 @@ ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -907,41 +949,43 @@ ; GFX8-LABEL: udot8_acc4: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s2, s0, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: s_and_b32 s2, s1, 15 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s1, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: s_lshr_b32 s11, s2, 28 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX8-NEXT: s_lshr_b32 s11, s0, 28 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s0 +; GFX8-NEXT: s_lshr_b32 s1, s1, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 @@ -950,49 +994,52 @@ ; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc4: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: s_and_b32 s2, s0, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_and_b32 s2, s1, 15 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s1, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-NEXT: s_lshr_b32 s11, s0, 28 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v9, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1001,49 +1048,52 @@ ; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: s_and_b32 s2, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-DL-NEXT: s_lshr_b32 s11, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s0 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1052,19 +1102,23 @@ ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1098,6 +1152,8 @@ ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1206,41 +1262,43 @@ ; GFX8-LABEL: udot8_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s2, s0, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: s_and_b32 s2, s1, 15 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s1, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: s_lshr_b32 s11, s2, 28 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX8-NEXT: s_lshr_b32 s11, s0, 28 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s0 +; GFX8-NEXT: s_lshr_b32 s1, s1, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1249,49 +1307,52 @@ ; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_CommutationInsideMAD: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: s_and_b32 s2, s0, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_and_b32 s2, s1, 15 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s1, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-NEXT: s_lshr_b32 s11, s0, 28 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v9, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1300,49 +1361,52 @@ ; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: s_and_b32 s2, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-DL-NEXT: s_lshr_b32 s11, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s0 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1351,19 +1415,23 @@ ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1397,6 +1465,8 @@ ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1504,11 +1574,13 @@ ; GFX8-LABEL: udot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s19, s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 @@ -1518,46 +1590,50 @@ ; GFX8-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX8-NEXT: s_and_b32 s6, s6, 15 -; GFX8-NEXT: s_lshr_b32 s4, s2, 28 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 -; GFX8-NEXT: s_and_b32 s2, s2, 15 +; GFX8-NEXT: s_lshr_b32 s1, s0, 28 +; GFX8-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s11, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, s0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX8-NEXT: v_mad_u32_u24 v1, s12, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 ; GFX8-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mad_u32_u24 v1, s10, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s15 -; GFX8-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, s5, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 -; GFX8-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, s2, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, s1, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s19, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 @@ -1567,46 +1643,50 @@ ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s12, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s5, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s2, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s1, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s19, s[8:9], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX9-DL-NEXT: s_lshr_b32 s7, s6, 28 @@ -1616,77 +1696,82 @@ ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX9-DL-NEXT: s_and_b32 s6, s6, 15 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 +; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s11, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s12, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s7, s4, 15 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v0 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v1 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s4, v1 +; GFX10-DL-NEXT: s_and_b32 s4, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s5, s1, 15 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v0 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s7, s10, v1 +; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v1 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s7, s10, v1 +; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s7, s10, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1810,11 +1895,13 @@ ; GFX8-LABEL: udot8_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s19, s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 ; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 @@ -1824,17 +1911,17 @@ ; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX8-NEXT: s_and_b32 s6, s6, 15 -; GFX8-NEXT: s_lshr_b32 s4, s2, 28 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 -; GFX8-NEXT: s_and_b32 s2, s2, 15 +; GFX8-NEXT: s_lshr_b32 s1, s0, 28 +; GFX8-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s11, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 ; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 @@ -1842,26 +1929,30 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NEXT: v_mad_u32_u24 v0, s10, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s14 -; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s19, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 @@ -1871,17 +1962,17 @@ ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -1889,49 +1980,57 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s0, v2, v3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s5, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2030,39 +2129,41 @@ ; GFX8-LABEL: udot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s2, s0, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_and_b32 s2, s1, 15 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s11, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40018 ; GFX8-NEXT: s_lshr_b32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: s_lshr_b32 s1, s1, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 @@ -2070,54 +2171,57 @@ ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s13, s2, 28 -; GFX9-NEXT: s_and_b32 s4, s0, 15 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s2, s0, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s4, s1, 15 +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s13, s0, 28 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s12, s0 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s2, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_lshr_b32 s1, s1, 28 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s8, s0 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s12, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s0, v4 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s8, s2 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, s1, v4 ; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s9, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s6, s7 ; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v3 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, s1, v5 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s5 ; GFX9-NEXT: v_mov_b32_e32 v6, s0 ; GFX9-NEXT: v_pk_mul_lo_u16 v6, s1, v6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2127,52 +2231,55 @@ ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 28 -; GFX9-DL-NEXT: s_and_b32 s4, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s11, s0, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s13, s0, 28 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s12, s0 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s2, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s12, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s0, v4 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s1, v4 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s9, s10 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s7 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v3 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s1, v5 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s2, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s0 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, s1, v6 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v5 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2182,17 +2289,21 @@ ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2234,6 +2345,8 @@ ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2347,14 +2460,16 @@ ; GFX8-LABEL: udot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s8, s1, 0x40004 ; GFX8-NEXT: s_bfe_u32 s10, s1, 0x4000c @@ -2411,17 +2526,21 @@ ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2477,17 +2596,21 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2543,18 +2666,22 @@ ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2606,6 +2733,8 @@ ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2694,41 +2823,43 @@ ; GFX8-LABEL: udot8_acc4_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s2, s0, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: s_and_b32 s2, s1, 15 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s1, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: s_lshr_b32 s11, s2, 28 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX8-NEXT: s_lshr_b32 s11, s0, 28 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s0 +; GFX8-NEXT: s_lshr_b32 s1, s1, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 @@ -2737,49 +2868,52 @@ ; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc4_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: s_and_b32 s2, s0, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_and_b32 s2, s1, 15 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s1, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-NEXT: s_lshr_b32 s11, s0, 28 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v9, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 @@ -2788,49 +2922,52 @@ ; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: s_and_b32 s2, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-DL-NEXT: s_lshr_b32 s11, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s0 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 @@ -2839,19 +2976,23 @@ ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2885,6 +3026,8 @@ ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2968,33 +3111,34 @@ ; GFX8-LABEL: udot8_variant1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s4, s2, 15 -; GFX8-NEXT: s_and_b32 s5, s3, 15 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s16, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: s_and_b32 s4, s0, 15 +; GFX8-NEXT: s_and_b32 s5, s1, 15 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s16, s0, 0x40018 +; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 ; GFX8-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX8-NEXT: s_bfe_u32 s7, s3, 0x40004 -; GFX8-NEXT: s_bfe_u32 s9, s3, 0x40008 -; GFX8-NEXT: s_bfe_u32 s11, s3, 0x4000c -; GFX8-NEXT: s_bfe_u32 s13, s3, 0x40010 -; GFX8-NEXT: s_bfe_u32 s15, s3, 0x40014 -; GFX8-NEXT: s_bfe_u32 s17, s3, 0x40018 -; GFX8-NEXT: s_lshr_b32 s3, s3, 28 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40004 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s17, s1, 0x40018 +; GFX8-NEXT: s_lshr_b32 s1, s1, 28 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 @@ -3007,41 +3151,44 @@ ; GFX8-NEXT: v_mad_u32_u24 v0, s15, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NEXT: v_mad_u32_u24 v2, s17, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_variant1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 15 -; GFX9-NEXT: s_and_b32 s5, s3, 15 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_and_b32 s4, s0, 15 +; GFX9-NEXT: s_and_b32 s5, s1, 15 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40004 -; GFX9-NEXT: s_bfe_u32 s9, s3, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40018 -; GFX9-NEXT: s_lshr_b32 s3, s3, 28 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x40004 +; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40018 +; GFX9-NEXT: s_lshr_b32 s1, s1, 28 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3054,42 +3201,50 @@ ; GFX9-NEXT: v_mad_u32_u24 v0, s15, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: v_mad_u32_u24 v2, s17, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_variant1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s0, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_nop 0 +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll --- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll @@ -16,8 +16,12 @@ ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v1, off ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -35,8 +39,12 @@ ; GFX10-NEXT: s_mov_b32 s4, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_short v[0:1], v1, off ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -53,8 +61,12 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 ; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: s_nop 0 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1 ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 ; GFX8-UNPACKED-NEXT: s_endpgm @@ -79,8 +91,12 @@ ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v1, off ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -98,8 +114,12 @@ ; GFX10-NEXT: s_mov_b32 s4, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_short v[0:1], v1, off ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -116,8 +136,12 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 ; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: s_nop 0 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1 ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 ; GFX8-UNPACKED-NEXT: s_endpgm @@ -142,8 +166,12 @@ ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -161,8 +189,12 @@ ; GFX10-NEXT: s_mov_b32 s4, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -179,8 +211,12 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 ; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: s_nop 0 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1 ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 ; GFX8-UNPACKED-NEXT: s_endpgm @@ -205,8 +241,12 @@ ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -224,8 +264,12 @@ ; GFX10-NEXT: s_mov_b32 s4, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -242,8 +286,12 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3 ; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-UNPACKED-NEXT: s_nop 0 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1 ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 ; GFX8-UNPACKED-NEXT: s_endpgm @@ -268,8 +316,12 @@ ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 unorm tfe d16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -287,8 +339,12 @@ ; GFX10-NEXT: s_mov_b32 s4, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -306,10 +362,14 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-UNPACKED-NEXT: s_nop 0 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x3 unorm tfe d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: s_nop 0 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0 ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v3 ; GFX8-UNPACKED-NEXT: s_endpgm @@ -344,8 +404,12 @@ ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf unorm tfe d16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[1:2], off ; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm @@ -364,8 +428,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[1:2], off ; GFX10-NEXT: global_store_dword v[0:1], v3, off ; GFX10-NEXT: s_endpgm @@ -385,12 +453,16 @@ ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-UNPACKED-NEXT: s_nop 0 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: image_load v[1:5], v0, s[4:11] dmask:0xf unorm tfe d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: s_nop 0 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2] ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5 ; GFX8-UNPACKED-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -22,7 +22,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 5 ; VI-NEXT: v_mov_b32_e32 v1, 0x12345678 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm entry: @@ -50,7 +52,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x12345678 ; VI-NEXT: v_mov_b32_e32 v1, 5 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm entry: @@ -77,7 +81,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_bfrev_b32_e32 v1, 1 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store i64 -9223372036854775808, i64 addrspace(1) *%out @@ -101,7 +107,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store i32 -2147483648, i32 addrspace(1)* %out @@ -125,7 +133,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float 0.0, float addrspace(1)* %out @@ -149,7 +159,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float -0.0, float addrspace(1)* %out @@ -173,7 +185,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0.5 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float 0.5, float addrspace(1)* %out @@ -197,7 +211,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -0.5 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float -0.5, float addrspace(1)* %out @@ -221,7 +237,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float 1.0, float addrspace(1)* %out @@ -245,7 +263,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -1.0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float -1.0, float addrspace(1)* %out @@ -269,7 +289,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 2.0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float 2.0, float addrspace(1)* %out @@ -293,7 +315,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -2.0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float -2.0, float addrspace(1)* %out @@ -317,7 +341,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 4.0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float 4.0, float addrspace(1)* %out @@ -341,7 +367,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -4.0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float -4.0, float addrspace(1)* %out @@ -365,7 +393,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0.15915494 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float 0x3FC45F3060000000, float addrspace(1)* %out @@ -389,7 +419,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0xbe22f983 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float 0xBFC45F3060000000, float addrspace(1)* %out @@ -413,7 +445,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x45800000 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm store float 4096.0, float addrspace(1)* %out @@ -435,11 +469,13 @@ ; VI-LABEL: add_inline_imm_0.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 0 +; VI-NEXT: v_add_f32_e64 v0, s2, 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0.0 @@ -462,11 +498,13 @@ ; VI-LABEL: add_inline_imm_0.5_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 0.5 +; VI-NEXT: v_add_f32_e64 v0, s2, 0.5 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0.5 @@ -489,11 +527,13 @@ ; VI-LABEL: add_inline_imm_neg_0.5_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, -0.5 +; VI-NEXT: v_add_f32_e64 v0, s2, -0.5 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -0.5 @@ -516,11 +556,13 @@ ; VI-LABEL: add_inline_imm_1.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s2, 1.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 1.0 @@ -543,11 +585,13 @@ ; VI-LABEL: add_inline_imm_neg_1.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, -1.0 +; VI-NEXT: v_add_f32_e64 v0, s2, -1.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -1.0 @@ -570,11 +614,13 @@ ; VI-LABEL: add_inline_imm_2.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 2.0 +; VI-NEXT: v_add_f32_e64 v0, s2, 2.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 2.0 @@ -597,11 +643,13 @@ ; VI-LABEL: add_inline_imm_neg_2.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, -2.0 +; VI-NEXT: v_add_f32_e64 v0, s2, -2.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -2.0 @@ -624,11 +672,13 @@ ; VI-LABEL: add_inline_imm_4.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 4.0 +; VI-NEXT: v_add_f32_e64 v0, s2, 4.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 4.0 @@ -651,11 +701,13 @@ ; VI-LABEL: add_inline_imm_neg_4.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, -4.0 +; VI-NEXT: v_add_f32_e64 v0, s2, -4.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -4.0 @@ -694,9 +746,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, 0.5, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load float, float addrspace(1)* %in @@ -736,9 +792,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, 0x44800000, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load float, float addrspace(1)* %in @@ -762,11 +822,13 @@ ; VI-LABEL: add_inline_imm_1_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 1 +; VI-NEXT: v_add_f32_e64 v0, s2, 1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36a0000000000000 @@ -789,11 +851,13 @@ ; VI-LABEL: add_inline_imm_2_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 2 +; VI-NEXT: v_add_f32_e64 v0, s2, 2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36b0000000000000 @@ -816,11 +880,13 @@ ; VI-LABEL: add_inline_imm_16_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 16 +; VI-NEXT: v_add_f32_e64 v0, s2, 16 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36e0000000000000 @@ -844,12 +910,14 @@ ; VI-LABEL: add_inline_imm_neg_1_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s0, s0, -1 +; VI-NEXT: s_add_i32 s0, s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %xbc = bitcast float %x to i32 @@ -875,12 +943,14 @@ ; VI-LABEL: add_inline_imm_neg_2_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s0, s0, -2 +; VI-NEXT: s_add_i32 s0, s2, -2 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %xbc = bitcast float %x to i32 @@ -906,12 +976,14 @@ ; VI-LABEL: add_inline_imm_neg_16_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s0, s0, -16 +; VI-NEXT: s_add_i32 s0, s2, -16 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %xbc = bitcast float %x to i32 @@ -936,11 +1008,13 @@ ; VI-LABEL: add_inline_imm_63_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 63 +; VI-NEXT: v_add_f32_e64 v0, s2, 63 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36ff800000000000 @@ -963,11 +1037,13 @@ ; VI-LABEL: add_inline_imm_64_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 64 +; VI-NEXT: v_add_f32_e64 v0, s2, 64 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x3700000000000000 @@ -990,11 +1066,14 @@ ; VI-LABEL: add_inline_imm_0.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0.0 @@ -1017,11 +1096,14 @@ ; VI-LABEL: add_inline_imm_0.5_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.5 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0.5 @@ -1044,11 +1126,14 @@ ; VI-LABEL: add_inline_imm_neg_0.5_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], -0.5 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -0.5 @@ -1071,11 +1156,14 @@ ; VI-LABEL: add_inline_imm_1.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 1.0 @@ -1098,11 +1186,14 @@ ; VI-LABEL: add_inline_imm_neg_1.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], -1.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -1.0 @@ -1125,11 +1216,14 @@ ; VI-LABEL: add_inline_imm_2.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 2.0 @@ -1152,11 +1246,14 @@ ; VI-LABEL: add_inline_imm_neg_2.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], -2.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -2.0 @@ -1179,11 +1276,14 @@ ; VI-LABEL: add_inline_imm_4.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 4.0 @@ -1206,11 +1306,14 @@ ; VI-LABEL: add_inline_imm_neg_4.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], -4.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -4.0 @@ -1235,11 +1338,14 @@ ; VI-LABEL: add_inline_imm_inv_2pi_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.15915494309189532 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x3fc45f306dc9c882 @@ -1271,6 +1377,8 @@ ; VI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0xbfc45f306dc9c882 @@ -1293,11 +1401,14 @@ ; VI-LABEL: add_inline_imm_1_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000001 @@ -1320,11 +1431,14 @@ ; VI-LABEL: add_inline_imm_2_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000002 @@ -1347,11 +1461,14 @@ ; VI-LABEL: add_inline_imm_16_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 16 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000010 @@ -1378,7 +1495,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0xffffffffffffffff @@ -1405,7 +1524,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -2 ; VI-NEXT: v_mov_b32_e32 v1, -1 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0xfffffffffffffffe @@ -1432,7 +1553,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -16 ; VI-NEXT: v_mov_b32_e32 v1, -1 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0xfffffffffffffff0 @@ -1455,11 +1578,14 @@ ; VI-LABEL: add_inline_imm_63_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 63 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x000000000000003F @@ -1482,11 +1608,14 @@ ; VI-LABEL: add_inline_imm_64_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 64 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000040 @@ -1513,7 +1642,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double 0.0, double addrspace(1)* %out @@ -1539,7 +1670,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_bfrev_b32_e32 v1, 1 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double -0.0, double addrspace(1)* %out @@ -1565,7 +1698,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0x3fe00000 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double 0.5, double addrspace(1)* %out @@ -1591,7 +1726,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0xbfe00000 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double -0.5, double addrspace(1)* %out @@ -1617,7 +1754,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double 1.0, double addrspace(1)* %out @@ -1643,7 +1782,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0xbff00000 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double -1.0, double addrspace(1)* %out @@ -1669,7 +1810,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double 2.0, double addrspace(1)* %out @@ -1695,7 +1838,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, -2.0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double -2.0, double addrspace(1)* %out @@ -1721,7 +1866,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0x40100000 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double 4.0, double addrspace(1)* %out @@ -1747,7 +1894,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0xc0100000 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double -4.0, double addrspace(1)* %out @@ -1773,7 +1922,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; VI-NEXT: v_mov_b32_e32 v1, 0x3fc45f30 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double 0x3fc45f306dc9c882, double addrspace(1)* %out @@ -1799,7 +1950,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; VI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double 0xbfc45f306dc9c882, double addrspace(1)* %out @@ -1825,7 +1978,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0x40b00000 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm store double 4096.0, double addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -11,7 +11,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -35,7 +37,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -59,7 +63,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -83,7 +89,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x3800 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -107,7 +115,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0xffffb800 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -131,7 +141,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x3c00 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -155,7 +167,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0xffffbc00 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -179,7 +193,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -203,7 +219,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0xffffc000 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -227,7 +245,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -251,7 +271,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0xffffc400 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -275,7 +297,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x3118 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -299,7 +323,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0xffffb118 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -323,7 +349,9 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x6c00 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -344,11 +372,13 @@ ; VI-LABEL: add_inline_imm_0.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, 0 +; VI-NEXT: v_add_f16_e64 v0, s6, 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -373,11 +403,13 @@ ; VI-LABEL: add_inline_imm_0.5_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, 0.5 +; VI-NEXT: v_add_f16_e64 v0, s6, 0.5 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -402,11 +434,13 @@ ; VI-LABEL: add_inline_imm_neg_0.5_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, -0.5 +; VI-NEXT: v_add_f16_e64 v0, s6, -0.5 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -431,11 +465,13 @@ ; VI-LABEL: add_inline_imm_1.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, 1.0 +; VI-NEXT: v_add_f16_e64 v0, s6, 1.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -460,11 +496,13 @@ ; VI-LABEL: add_inline_imm_neg_1.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, -1.0 +; VI-NEXT: v_add_f16_e64 v0, s6, -1.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -489,11 +527,13 @@ ; VI-LABEL: add_inline_imm_2.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, 2.0 +; VI-NEXT: v_add_f16_e64 v0, s6, 2.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -518,11 +558,13 @@ ; VI-LABEL: add_inline_imm_neg_2.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, -2.0 +; VI-NEXT: v_add_f16_e64 v0, s6, -2.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -547,11 +589,13 @@ ; VI-LABEL: add_inline_imm_4.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, 4.0 +; VI-NEXT: v_add_f16_e64 v0, s6, 4.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -576,11 +620,13 @@ ; VI-LABEL: add_inline_imm_neg_4.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, -4.0 +; VI-NEXT: v_add_f16_e64 v0, s6, -4.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -614,9 +660,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_e32 v0, 0.5, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -658,9 +708,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_e32 v0, 0x6400, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -693,11 +747,13 @@ ; VI-LABEL: add_inline_imm_1_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, 1 +; VI-NEXT: v_add_f16_e64 v0, s6, 1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -722,11 +778,13 @@ ; VI-LABEL: add_inline_imm_2_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, 2 +; VI-NEXT: v_add_f16_e64 v0, s6, 2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -751,11 +809,13 @@ ; VI-LABEL: add_inline_imm_16_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, 16 +; VI-NEXT: v_add_f16_e64 v0, s6, 16 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -789,9 +849,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, -1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -832,9 +896,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, -2, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -875,9 +943,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, -16, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -909,11 +981,13 @@ ; VI-LABEL: add_inline_imm_63_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, 63 +; VI-NEXT: v_add_f16_e64 v0, s6, 63 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -938,11 +1012,13 @@ ; VI-LABEL: add_inline_imm_64_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f16_e64 v0, s4, 64 +; VI-NEXT: v_add_f16_e64 v0, s6, 64 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; FIXME: Merge into imm.ll diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -393,6 +393,7 @@ ; GCN-LABEL: {{^}}extract_vgpr_offset_multiple_in_block: ; FIXME: Why is vector copied in between? +; FIXME: Why is load scheduled late with XNACK on. ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] ; GCN-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9 @@ -400,14 +401,14 @@ ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]] ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]] -; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec +; GCN-DAG: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec -; GCN: s_waitcnt vmcnt(0) -; PREGFX9: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]] -; GFX9: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], 1, [[IDX0]] +; GCN-DAG: s_waitcnt vmcnt(0) +; PREGFX9-DAG: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]] +; GFX9-DAG: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], 1, [[IDX0]] -; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: +; GCN-DAG: [[LOOP0:BB[0-9]+_[0-9]+]]: ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] ; GCN: s_and_saveexec_b64 vcc, vcc diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -28,15 +28,17 @@ ; VI-LABEL: insertelement_v4f32_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, 0x40a00000 +; VI-NEXT: s_mov_b32 s8, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 @@ -63,15 +65,17 @@ ; VI-LABEL: insertelement_v4f32_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s5, 0x40a00000 +; VI-NEXT: s_mov_b32 s9, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 @@ -98,15 +102,17 @@ ; VI-LABEL: insertelement_v4f32_2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s6, 0x40a00000 +; VI-NEXT: s_mov_b32 s10, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 @@ -133,15 +139,17 @@ ; VI-LABEL: insertelement_v4f32_3: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s7, 0x40a00000 +; VI-NEXT: s_mov_b32 s11, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 @@ -168,15 +176,17 @@ ; VI-LABEL: insertelement_v4i32_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_movk_i32 s4, 0x3e7 +; VI-NEXT: s_movk_i32 s8, 0x3e7 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x i32> %a, i32 999, i32 0 @@ -201,13 +211,15 @@ ; VI-LABEL: insertelement_v3f32_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1 @@ -232,13 +244,15 @@ ; VI-LABEL: insertelement_v3f32_2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2 @@ -256,23 +270,43 @@ } define <4 x float> @insertelement_to_sgpr() nounwind { -; GCN-LABEL: insertelement_to_sgpr: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s12, 0 -; GCN-NEXT: s_mov_b32 s4, s12 -; GCN-NEXT: s_mov_b32 s5, s12 -; GCN-NEXT: s_mov_b32 s6, s12 -; GCN-NEXT: s_mov_b32 s7, s12 -; GCN-NEXT: s_mov_b32 s8, s12 -; GCN-NEXT: s_mov_b32 s9, s12 -; GCN-NEXT: s_mov_b32 s10, s12 -; GCN-NEXT: s_mov_b32 s11, s12 -; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: insertelement_to_sgpr: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, 0 +; SI-NEXT: s_mov_b32 s4, s12 +; SI-NEXT: s_mov_b32 s5, s12 +; SI-NEXT: s_mov_b32 s6, s12 +; SI-NEXT: s_mov_b32 s7, s12 +; SI-NEXT: s_mov_b32 s8, s12 +; SI-NEXT: s_mov_b32 s9, s12 +; SI-NEXT: s_mov_b32 s10, s12 +; SI-NEXT: s_mov_b32 s11, s12 +; SI-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: insertelement_to_sgpr: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, 0 +; VI-NEXT: s_mov_b32 s4, s12 +; VI-NEXT: s_mov_b32 s5, s12 +; VI-NEXT: s_mov_b32 s6, s12 +; VI-NEXT: s_mov_b32 s7, s12 +; VI-NEXT: s_mov_b32 s8, s12 +; VI-NEXT: s_mov_b32 s9, s12 +; VI-NEXT: s_mov_b32 s10, s12 +; VI-NEXT: s_mov_b32 s11, s12 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0) @@ -302,17 +336,19 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s8, 1 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s8, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b @@ -344,23 +380,25 @@ ; ; VI-LABEL: dynamic_insertelement_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s4, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x20 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 2 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 1 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s8 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 @@ -394,26 +432,28 @@ ; ; VI-LABEL: dynamic_insertelement_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s4, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x20 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 3 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 2 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 1 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 @@ -462,12 +502,13 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v5, s15 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s10 @@ -479,7 +520,6 @@ ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-NEXT: v_mov_b32_e32 v5, s15 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 ; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc ; VI-NEXT: v_mov_b32_e32 v5, s14 @@ -491,6 +531,8 @@ ; VI-NEXT: v_mov_b32_e32 v8, s12 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -537,7 +579,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dword s6, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -558,8 +600,10 @@ ; VI-NEXT: v_mov_b32_e32 v13, s21 ; VI-NEXT: v_mov_b32_e32 v14, s22 ; VI-NEXT: v_mov_b32_e32 v15, s23 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 m0, s6 ; VI-NEXT: v_movreld_b32_e32 v0, v16 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -592,16 +636,18 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s8, 1 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s8, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i32> %a, i32 5, i32 %b @@ -632,22 +678,25 @@ ; ; VI-LABEL: dynamic_insertelement_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s4, s[4:5], 0x20 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; VI-NEXT: s_load_dword s3, s[4:5], 0x20 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s3, 2 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s3, 1 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s3, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x i32> %a, i32 5, i32 %b store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 @@ -682,27 +731,29 @@ ; ; VI-LABEL: dynamic_insertelement_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_load_dword s6, s[4:5], 0x20 -; VI-NEXT: s_load_dword s4, s[4:5], 0x44 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dword s7, s[4:5], 0x44 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 @@ -750,11 +801,12 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v4, s15 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 ; VI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s10 @@ -766,7 +818,6 @@ ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v4, s15 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 ; VI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, s14 @@ -778,6 +829,8 @@ ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -823,7 +876,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dword s6, s[4:5], 0x80 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -843,8 +896,10 @@ ; VI-NEXT: v_mov_b32_e32 v13, s21 ; VI-NEXT: v_mov_b32_e32 v14, s22 ; VI-NEXT: v_mov_b32_e32 v15, s23 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 m0, s6 ; VI-NEXT: v_movreld_b32_e32 v0, 5 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -876,15 +931,17 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-NEXT: s_load_dword s4, s[4:5], 0xc +; VI-NEXT: s_load_dword s7, s[4:5], 0xc ; VI-NEXT: v_mov_b32_e32 v0, 0x50005 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: s_lshl_b32 s4, s4, 4 +; VI-NEXT: s_lshl_b32 s4, s7, 4 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i16> %a, i16 5, i32 %b @@ -920,21 +977,23 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 ; VI-NEXT: s_mov_b32 s5, 0 +; VI-NEXT: s_mov_b32 s4, 0xffff ; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_lshl_b32 s8, s4, 4 -; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 4 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 ; VI-NEXT: s_mov_b32 s8, 0x50005 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_bfi_b32 v0, s5, v0, v1 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, s8 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_bfi_b32 v1, s4, v1, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -964,16 +1023,18 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28 -; VI-NEXT: s_load_dword s4, s[4:5], 0x4c +; VI-NEXT: s_load_dword s7, s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 +; VI-NEXT: s_lshl_b32 s4, s7, 3 ; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1 ; VI-NEXT: v_and_b32_e32 v1, 0x505, v0 ; VI-NEXT: v_xor_b32_e32 v0, -1, v0 ; VI-NEXT: v_and_b32_e32 v0, s6, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i8> %a, i8 5, i32 %b @@ -1006,16 +1067,18 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28 -; VI-NEXT: s_load_dword s4, s[4:5], 0x4c +; VI-NEXT: s_load_dword s7, s[4:5], 0x4c ; VI-NEXT: v_mov_b32_e32 v0, 0x5050505 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: s_lshl_b32 s4, s4, 3 +; VI-NEXT: s_lshl_b32 s4, s7, 3 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 ; VI-NEXT: s_endpgm @@ -1045,15 +1108,17 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28 -; VI-NEXT: s_load_dword s4, s[4:5], 0x4c +; VI-NEXT: s_load_dword s7, s[4:5], 0x4c ; VI-NEXT: v_mov_b32_e32 v0, 0x5050505 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: s_lshl_b32 s4, s4, 3 +; VI-NEXT: s_lshl_b32 s4, s7, 3 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x i8> %a, i8 5, i32 %b @@ -1109,6 +1174,8 @@ ; VI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4 @@ -1231,8 +1298,8 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s4, s[4:5], 0x20 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_load_dword s4, s[4:5], 0x20 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s11, 24 @@ -1315,6 +1382,8 @@ ; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc ; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <16 x i8> %a, i8 5, i32 %b @@ -1349,6 +1418,7 @@ ; VI-LABEL: insert_split_bb: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dword s0, s[4:5], 0x10 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 @@ -1364,6 +1434,8 @@ ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm entry: @@ -1413,24 +1485,26 @@ ; ; VI-LABEL: dynamic_insertelement_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x30 -; VI-NEXT: s_load_dword s4, s[4:5], 0x60 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 ; VI-NEXT: v_mov_b32_e32 v1, 0x40200000 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dword s4, s[4:5], 0x60 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x double> %a, double 8.0, i32 %b store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 @@ -1461,23 +1535,25 @@ ; ; VI-LABEL: dynamic_insertelement_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_load_dword s6, s[4:5], 0x20 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 ; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s6, 0 +; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[2:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i64> %a, i64 5, i32 %b store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 @@ -1535,6 +1611,8 @@ ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1581,12 +1659,13 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: v_mov_b32_e32 v4, 0x40200000 +; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v5, s15 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v0, s10 @@ -1596,7 +1675,6 @@ ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-NEXT: v_mov_b32_e32 v5, s15 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3 ; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v5, s14 @@ -1606,6 +1684,8 @@ ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1665,15 +1745,15 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x40 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v16, 64 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: s_mov_b32 s11, 0x1100f000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: s_and_b32 s4, s4, 7 ; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v12, s24 ; VI-NEXT: v_mov_b32_e32 v13, s25 ; VI-NEXT: v_mov_b32_e32 v14, s26 @@ -1688,6 +1768,8 @@ ; VI-NEXT: v_mov_b32_e32 v9, s21 ; VI-NEXT: v_mov_b32_e32 v10, s22 ; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80 @@ -1695,12 +1777,18 @@ ; VI-NEXT: v_or_b32_e32 v16, s4, v16 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0x40200000 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80 ; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96 ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -8,28 +8,46 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s0, 0x3e7, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; -; CIVI-LABEL: s_insertelement_v2i16_0: -; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v1, s1 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000 -; CIVI-NEXT: s_or_b32 s0, s0, 0x3e7 -; CIVI-NEXT: v_mov_b32_e32 v2, s0 -; CIVI-NEXT: flat_store_dword v[0:1], v2 -; CIVI-NEXT: s_endpgm +; VI-LABEL: s_insertelement_v2i16_0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s0, 0x3e7 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: s_insertelement_v2i16_0: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; CI-NEXT: s_or_b32 s0, s0, 0x3e7 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 0 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -41,30 +59,34 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s2 +; GFX9-NEXT: s_pack_lh_b32_b16 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_0_reg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s0, s4, 0xffff +; VI-NEXT: s_and_b32 s0, s6, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 ; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -93,15 +115,17 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s0 @@ -111,17 +135,19 @@ ; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s0, s4, 0xffff +; VI-NEXT: s_and_b32 s0, s6, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s2, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff0000 ; VI-NEXT: s_or_b32 s0, s0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: ; use s1 @@ -160,30 +186,34 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s2 +; GFX9-NEXT: s_pack_hh_b32_b16 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: s_lshr_b32 s0, s6, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 ; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -214,15 +244,17 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_lshr_b32 s0, s6, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s1, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s0 @@ -232,16 +264,18 @@ ; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: s_lshr_b32 s0, s6, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 ; VI-NEXT: s_or_b32 s1, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: ; use s0 @@ -280,16 +314,18 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: s_lshr_b32 s0, s6, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s1, s2, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s0 @@ -302,17 +338,19 @@ ; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: s_lshr_b32 s0, s6, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s2, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff0000 ; VI-NEXT: s_or_b32 s2, s0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: ; use s0 @@ -363,28 +401,46 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, 0x3e7 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; -; CIVI-LABEL: s_insertelement_v2i16_1: -; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v1, s1 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_and_b32 s0, s2, 0xffff -; CIVI-NEXT: s_or_b32 s0, s0, 0x3e70000 -; CIVI-NEXT: v_mov_b32_e32 v2, s0 -; CIVI-NEXT: flat_store_dword v[0:1], v2 -; CIVI-NEXT: s_endpgm +; VI-LABEL: s_insertelement_v2i16_1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s0, s2, 0xffff +; VI-NEXT: s_or_b32 s0, s0, 0x3e70000 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: s_insertelement_v2i16_1: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_and_b32 s0, s2, 0xffff +; CI-NEXT: s_or_b32 s0, s0, 0x3e70000 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -395,30 +451,34 @@ ; GFX9-LABEL: s_insertelement_v2i16_1_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_1_reg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshl_b32 s0, s4, 16 +; VI-NEXT: s_lshl_b32 s0, s6, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, s2, 0xffff ; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -448,29 +508,47 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s2, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, 0x4500, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; -; CIVI-LABEL: s_insertelement_v2f16_0: -; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v1, s1 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000 -; CIVI-NEXT: s_or_b32 s0, s0, 0x4500 -; CIVI-NEXT: v_mov_b32_e32 v2, s0 -; CIVI-NEXT: flat_store_dword v[0:1], v2 -; CIVI-NEXT: s_endpgm +; VI-LABEL: s_insertelement_v2f16_0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s0, 0x4500 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: s_insertelement_v2f16_0: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; CI-NEXT: s_or_b32 s0, s0, 0x4500 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 store <2 x half> %vecins, <2 x half> addrspace(1)* %out @@ -482,28 +560,46 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, 0x4500 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; -; CIVI-LABEL: s_insertelement_v2f16_1: -; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v1, s1 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_and_b32 s0, s2, 0xffff -; CIVI-NEXT: s_or_b32 s0, s0, 0x45000000 -; CIVI-NEXT: v_mov_b32_e32 v2, s0 -; CIVI-NEXT: flat_store_dword v[0:1], v2 -; CIVI-NEXT: s_endpgm +; VI-LABEL: s_insertelement_v2f16_1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s0, s2, 0xffff +; VI-NEXT: s_or_b32 s0, s0, 0x45000000 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; CI-LABEL: s_insertelement_v2f16_1: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_and_b32 s0, s2, 0xffff +; CI-NEXT: s_or_b32 s0, s0, 0x45000000 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 store <2 x half> %vecins, <2 x half> addrspace(1)* %out @@ -521,12 +617,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v2, v3, s4, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -538,6 +638,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -545,6 +647,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; VI-NEXT: v_or_b32_e32 v2, 0x3e7, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -579,40 +683,48 @@ ; GFX9-LABEL: v_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s4 +; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: s_lshr_b32 s1, s6, 16 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; VI-NEXT: v_or_b32_e32 v2, s1, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -657,12 +769,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v2, v3, 53, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -674,6 +790,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -681,6 +799,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; VI-NEXT: v_or_b32_e32 v2, 53, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -722,6 +842,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -729,6 +851,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -741,12 +865,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -786,6 +914,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -793,6 +923,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v2, -15, 16, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -805,12 +937,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -851,6 +987,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -858,6 +996,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -869,6 +1009,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -876,6 +1018,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -915,6 +1059,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -922,6 +1068,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, 53 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -933,6 +1081,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -940,6 +1090,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; VI-NEXT: v_or_b32_e32 v2, 53, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -980,6 +1132,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -987,6 +1141,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -999,12 +1155,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1044,6 +1204,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1051,6 +1213,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v2, 35, 16, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1063,12 +1227,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1104,36 +1272,40 @@ ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 ; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s0, s[8:9], 0x0 ; VI-NEXT: s_load_dword s1, s[2:3], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s0, s0, 4 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1165,42 +1337,50 @@ ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_lshl_b32 s2, s6, 4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: s_lshl_b32 s2, s4, 4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v2, s0, v3, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshl_b32 s2, s6, 4 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_lshl_b32 s2, s4, 4 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_bfi_b32 v2, s0, v3, v4 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1238,54 +1418,66 @@ ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_mov_b32 s5, 0x12341234 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, 0xffff -; GFX9-NEXT: s_mov_b32 s7, 0x12341234 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v2, s7, v3 +; GFX9-NEXT: v_bfi_b32 v2, v2, s5, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: s_mov_b32 s5, 0x12341234 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s6, 0xffff -; VI-NEXT: s_mov_b32 s7, 0x12341234 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s6 +; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_bfi_b32 v2, v2, s7, v3 +; VI-NEXT: v_bfi_b32 v2, v2, s5, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1330,39 +1522,47 @@ ; GFX9-LABEL: v_insertelement_v4f16_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, v4, s4, v0 +; GFX9-NEXT: v_bfi_b32 v0, v4, s6, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: s_and_b32 s1, s6, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; VI-NEXT: v_or_b32_e32 v0, s1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1401,39 +1601,47 @@ ; GFX9-LABEL: v_insertelement_v4f16_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s4, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, s6, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_lshl_b32 s2, s4, 16 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_lshl_b32 s2, s6, 16 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1472,39 +1680,47 @@ ; GFX9-LABEL: v_insertelement_v4f16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v4, s4, v1 +; GFX9-NEXT: v_bfi_b32 v1, v4, s6, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: s_and_b32 s1, s6, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_or_b32_e32 v1, s1, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1543,39 +1759,47 @@ ; GFX9-LABEL: v_insertelement_v4f16_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_3: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_lshl_b32 s2, s4, 16 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_lshl_b32 s2, s6, 16 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1614,39 +1838,47 @@ ; GFX9-LABEL: v_insertelement_v4i16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v4, s4, v1 +; GFX9-NEXT: v_bfi_b32 v1, v4, s6, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4i16_2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: s_and_b32 s1, s6, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_or_b32_e32 v1, s1, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1694,18 +1926,22 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s6, s6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, v5, s1, v1 ; GFX9-NEXT: v_bfi_b32 v0, v4, s1, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -1720,7 +1956,6 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_mov_b32 s5, 0 ; VI-NEXT: s_and_b32 s1, s6, s4 @@ -1728,12 +1963,17 @@ ; VI-NEXT: s_lshl_b32 s0, s1, 16 ; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_bfi_b32 v1, v5, s0, v1 ; VI-NEXT: v_bfi_b32 v0, v4, s0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1781,54 +2021,62 @@ ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: s_mov_b32 s7, 0 -; GFX9-NEXT: s_mov_b32 s6, 0xffff +; GFX9-NEXT: s_mov_b32 s5, 0 +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s4 -; GFX9-NEXT: s_lshl_b32 s2, s5, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s6 +; GFX9-NEXT: s_lshl_b32 s2, s7, 4 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], s2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[4:5], s2 ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v5, s3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s1, v4, v1 ; GFX9-NEXT: v_bfi_b32 v0, s0, v5, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_mov_b32 s6, 0xffff -; VI-NEXT: s_mov_b32 s7, 0 +; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: s_mov_b32 s5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_and_b32 s2, s4, s6 +; VI-NEXT: s_and_b32 s2, s6, s4 ; VI-NEXT: s_lshl_b32 s3, s2, 16 ; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_lshl_b32 s4, s5, 4 +; VI-NEXT: s_lshl_b32 s6, s7, 4 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], s4 +; VI-NEXT: s_lshl_b64 s[0:1], s[4:5], s6 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_bfi_b32 v1, s1, v4, v1 ; VI-NEXT: v_bfi_b32 v0, s0, v5, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir --- a/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir @@ -229,7 +229,7 @@ # GCN-LABEL: name: hazard_lds_branch_global # GCN: S_WAITCNT_VSCNT undef $sgpr_null, 0 -# GCN-NEXT: GLOBAL_LOAD_DWORD +# GCN: GLOBAL_LOAD_DWORD --- name: hazard_lds_branch_global body: | @@ -261,6 +261,7 @@ # GCN-LABEL: name: no_hazard_lds_branch_flat # GCN: bb.1: +# GCN-NEXT: S_NOP # GCN-NEXT: FLAT_LOAD_DWORD --- name: no_hazard_lds_branch_flat diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -1,5 +1,5 @@ ;RUN: llc < %s -march=amdgcn -mcpu=verde -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI -;RUN: llc < %s -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-xnack -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI ;CHECK-LABEL: {{^}}test1: ;CHECK-NOT: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -445,7 +445,7 @@ ; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset: ; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} -; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen +; CHECK: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) { %alloca = alloca i32, addrspace(5) %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -19,24 +19,28 @@ ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) @@ -59,22 +63,26 @@ ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) @@ -114,44 +122,52 @@ ; VI-LABEL: v_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v6 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -191,12 +207,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -208,12 +228,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -251,12 +275,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -268,12 +296,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -309,44 +341,52 @@ ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v6 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -385,44 +425,52 @@ ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v6 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -461,44 +509,52 @@ ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v6 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -538,44 +594,52 @@ ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v6 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -166,7 +166,7 @@ ; NOLOOP: s_mov_b32 m0, 0{{$}} ; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds ; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; NOLOOP-NEXT: load_dword +; NOLOOP: load_dword define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 { call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) %load = load volatile i32, i32 addrspace(1)* %ptr @@ -194,7 +194,7 @@ ; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; NOLOOP-NEXT: load_dword +; NOLOOP: load_dword define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 { call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) fence release diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll @@ -304,7 +304,7 @@ ; SI-NEXT: s_mov_b32 s{{[0-9]+}}, -1 ; GCN-NEXT: v_mov_b32_e32 ; GCN-NEXT: v_mov_b32_e32 -; GCN-NEXT: {{global|flat|buffer}}_store_dwordx2 +; GCN: {{global|flat|buffer}}_store_dwordx2 define amdgpu_kernel void @v_icmp_i1_ne0(i64 addrspace(1)* %out, i32 %a, i32 %b) { %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -66,10 +66,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v3, v0 ; FIJI-NEXT: v_mov_b32_e32 v4, v0 -; FIJI-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf unorm tfe ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf unorm tfe +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -82,10 +86,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf unorm tfe ; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -93,10 +101,14 @@ ; NOPRT-LABEL: load_1d_tfe: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 -; NOPRT-NEXT: image_load v[0:4], v0, s[0:7] dmask:0xf unorm tfe ; NOPRT-NEXT: v_mov_b32_e32 v5, s8 ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:4], v0, s[0:7] dmask:0xf unorm tfe +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[5:6], v4, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -111,9 +123,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -150,10 +166,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v3, v0 ; FIJI-NEXT: v_mov_b32_e32 v4, v0 -; FIJI-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf unorm lwe ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf unorm lwe +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -166,10 +186,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf unorm lwe ; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -177,10 +201,14 @@ ; NOPRT-LABEL: load_1d_lwe: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 -; NOPRT-NEXT: image_load v[0:4], v0, s[0:7] dmask:0xf unorm lwe ; NOPRT-NEXT: v_mov_b32_e32 v5, s8 ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:4], v0, s[0:7] dmask:0xf unorm lwe +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[5:6], v4, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -195,9 +223,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -272,10 +304,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v3, v0 ; FIJI-NEXT: v_mov_b32_e32 v4, v0 -; FIJI-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -289,10 +325,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe ; GFX6789-NEXT: v_mov_b32_e32 v5, s8 ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -300,10 +340,14 @@ ; NOPRT-LABEL: load_2d_tfe: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 -; NOPRT-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe ; NOPRT-NEXT: v_mov_b32_e32 v5, s8 ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[5:6], v4, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -319,9 +363,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -398,10 +446,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v3, v0 ; FIJI-NEXT: v_mov_b32_e32 v4, v0 -; FIJI-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -416,10 +468,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe ; GFX6789-NEXT: v_mov_b32_e32 v5, s8 ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -427,10 +483,14 @@ ; NOPRT-LABEL: load_3d_tfe_lwe: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 -; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe lwe ; NOPRT-NEXT: v_mov_b32_e32 v5, s8 ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe lwe +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[5:6], v4, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -447,9 +507,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -526,10 +590,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v3, v0 ; FIJI-NEXT: v_mov_b32_e32 v4, v0 -; FIJI-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm lwe da ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm lwe da +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -544,10 +612,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm lwe da ; GFX6789-NEXT: v_mov_b32_e32 v5, s8 ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -555,10 +627,14 @@ ; NOPRT-LABEL: load_cube_lwe: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 -; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm lwe da ; NOPRT-NEXT: v_mov_b32_e32 v5, s8 ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm lwe da +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[5:6], v4, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -575,9 +651,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -652,10 +732,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v3, v0 ; FIJI-NEXT: v_mov_b32_e32 v4, v0 -; FIJI-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe da ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe da +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -669,10 +753,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe da ; GFX6789-NEXT: v_mov_b32_e32 v5, s8 ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -680,10 +768,14 @@ ; NOPRT-LABEL: load_1darray_tfe: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 -; NOPRT-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe da ; NOPRT-NEXT: v_mov_b32_e32 v5, s8 ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe da +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[5:6], v4, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -699,9 +791,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; encoding: [0x20,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -778,10 +874,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v3, v0 ; FIJI-NEXT: v_mov_b32_e32 v4, v0 -; FIJI-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm lwe da ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm lwe da +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -796,10 +896,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm lwe da ; GFX6789-NEXT: v_mov_b32_e32 v5, s8 ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -807,10 +911,14 @@ ; NOPRT-LABEL: load_2darray_lwe: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 -; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm lwe da ; NOPRT-NEXT: v_mov_b32_e32 v5, s8 ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm lwe da +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[5:6], v4, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -827,9 +935,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -906,10 +1018,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v3, v0 ; FIJI-NEXT: v_mov_b32_e32 v4, v0 -; FIJI-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -924,10 +1040,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe ; GFX6789-NEXT: v_mov_b32_e32 v5, s8 ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -935,10 +1055,14 @@ ; NOPRT-LABEL: load_2dmsaa_both: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 -; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe lwe ; NOPRT-NEXT: v_mov_b32_e32 v5, s8 ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe lwe +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[5:6], v4, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -955,9 +1079,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -1036,10 +1164,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v3, v0 ; FIJI-NEXT: v_mov_b32_e32 v4, v0 -; FIJI-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf unorm tfe da ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf unorm tfe da +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -1055,10 +1187,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf unorm tfe da ; GFX6789-NEXT: v_mov_b32_e32 v5, s8 ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1066,10 +1202,14 @@ ; NOPRT-LABEL: load_2darraymsaa_tfe: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 -; NOPRT-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf unorm tfe da ; NOPRT-NEXT: v_mov_b32_e32 v5, s8 ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf unorm tfe da +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[5:6], v4, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -1086,10 +1226,14 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x38,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x38,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -1164,10 +1308,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v3, v0 ; FIJI-NEXT: v_mov_b32_e32 v4, v0 -; FIJI-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf unorm lwe ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf unorm lwe +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -1181,10 +1329,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf unorm lwe ; GFX6789-NEXT: v_mov_b32_e32 v5, s8 ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1192,10 +1344,14 @@ ; NOPRT-LABEL: load_mip_1d_lwe: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 -; NOPRT-NEXT: image_load_mip v[0:4], v[0:1], s[0:7] dmask:0xf unorm lwe ; NOPRT-NEXT: v_mov_b32_e32 v5, s8 ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load_mip v[0:4], v[0:1], s[0:7] dmask:0xf unorm lwe +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[5:6], v4, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -1211,9 +1367,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x06,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -1290,10 +1450,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v3, v0 ; FIJI-NEXT: v_mov_b32_e32 v4, v0 -; FIJI-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -1308,10 +1472,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe ; GFX6789-NEXT: v_mov_b32_e32 v5, s8 ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1319,10 +1487,14 @@ ; NOPRT-LABEL: load_mip_2d_tfe: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 -; NOPRT-NEXT: image_load_mip v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe ; NOPRT-NEXT: v_mov_b32_e32 v5, s8 ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load_mip v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[5:6], v4, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -1339,9 +1511,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] -; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -1367,6 +1543,8 @@ ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: v_mov_b32_e32 v1, 0 ; FIJI-NEXT: v_mov_b32_e32 v2, v1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, v2 @@ -1376,6 +1554,8 @@ ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v1, 0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v1 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: v_mov_b32_e32 v0, v2 @@ -1384,6 +1564,8 @@ ; NOPRT-LABEL: load_1d_V2_tfe_dmask0: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v1, 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: v_mov_b32_e32 v0, v1 @@ -1394,6 +1576,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; encoding: [0x01,0x03,0x04,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x11,0x01,0xf0,0x00,0x01,0x00,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; encoding: [0x02,0x03,0x00,0x7e] @@ -1419,6 +1603,8 @@ ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: v_mov_b32_e32 v1, 0 ; FIJI-NEXT: v_mov_b32_e32 v2, v1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, v2 @@ -1428,6 +1614,8 @@ ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v1, 0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v1 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: v_mov_b32_e32 v0, v2 @@ -1436,6 +1624,8 @@ ; NOPRT-LABEL: load_1d_V1_tfe_dmask0: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v1, 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: v_mov_b32_e32 v0, v1 @@ -1446,6 +1636,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; encoding: [0x01,0x03,0x04,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x11,0x01,0xf0,0x00,0x01,0x00,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; encoding: [0x02,0x03,0x00,0x7e] @@ -1471,6 +1663,8 @@ ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: v_mov_b32_e32 v3, 0 ; FIJI-NEXT: v_mov_b32_e32 v4, v3 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 unorm tfe ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, v4 @@ -1480,6 +1674,8 @@ ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v3, 0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v3 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 unorm tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: v_mov_b32_e32 v0, v4 @@ -1488,6 +1684,8 @@ ; NOPRT-LABEL: load_mip_2d_tfe_dmask0: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: image_load_mip v[2:3], v[0:2], s[0:7] dmask:0x1 unorm tfe ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: v_mov_b32_e32 v0, v3 @@ -1498,6 +1696,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x11,0x05,0xf0,0x00,0x03,0x00,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; encoding: [0x04,0x03,0x00,0x7e] @@ -1523,6 +1723,8 @@ ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: v_mov_b32_e32 v3, 0 ; FIJI-NEXT: v_mov_b32_e32 v4, v3 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 unorm tfe ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, v4 @@ -1532,6 +1734,8 @@ ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v3, 0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v3 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 unorm tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: v_mov_b32_e32 v0, v4 @@ -1540,6 +1744,8 @@ ; NOPRT-LABEL: load_mip_2d_tfe_nouse: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: image_load_mip v[2:3], v[0:2], s[0:7] dmask:0x1 unorm tfe ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: v_mov_b32_e32 v0, v3 @@ -1550,6 +1756,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x11,0x05,0xf0,0x00,0x03,0x00,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; encoding: [0x04,0x03,0x00,0x7e] @@ -1575,6 +1783,8 @@ ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: v_mov_b32_e32 v3, 0 ; FIJI-NEXT: v_mov_b32_e32 v4, v3 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 unorm tfe ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, v4 @@ -1584,6 +1794,8 @@ ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v3, 0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v3 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 unorm tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: v_mov_b32_e32 v0, v4 @@ -1592,6 +1804,8 @@ ; NOPRT-LABEL: load_mip_2d_tfe_nouse_V2: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: image_load_mip v[2:3], v[0:2], s[0:7] dmask:0x1 unorm tfe ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: v_mov_b32_e32 v0, v3 @@ -1602,6 +1816,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x11,0x05,0xf0,0x00,0x03,0x00,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; encoding: [0x04,0x03,0x00,0x7e] @@ -1627,6 +1843,8 @@ ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: v_mov_b32_e32 v3, 0 ; FIJI-NEXT: v_mov_b32_e32 v4, v3 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x2 unorm tfe ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, v4 @@ -1636,6 +1854,8 @@ ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v3, 0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v3 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x2 unorm tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: v_mov_b32_e32 v0, v4 @@ -1644,6 +1864,8 @@ ; NOPRT-LABEL: load_mip_2d_tfe_nouse_V1: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: image_load_mip v[2:3], v[0:2], s[0:7] dmask:0x2 unorm tfe ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: v_mov_b32_e32 v0, v3 @@ -1654,6 +1876,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x12,0x05,0xf0,0x00,0x03,0x00,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; encoding: [0x04,0x03,0x00,0x7e] @@ -1688,10 +1912,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v1, v0 ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v3, v0 -; FIJI-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 unorm tfe ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 unorm tfe +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v3, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -1703,10 +1931,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 +; GFX6789-NEXT: v_mov_b32_e32 v5, s9 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 unorm tfe ; GFX6789-NEXT: v_mov_b32_e32 v4, s8 -; GFX6789-NEXT: v_mov_b32_e32 v5, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[4:5], v3, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1714,10 +1946,14 @@ ; NOPRT-LABEL: load_1d_tfe_V4_dmask3: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v3, 0 -; NOPRT-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe ; NOPRT-NEXT: v_mov_b32_e32 v4, s8 ; NOPRT-NEXT: v_mov_b32_e32 v5, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[4:5], v3, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -1731,9 +1967,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x17,0x01,0xf0,0x04,0x00,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v4, s8 ; encoding: [0x08,0x02,0x08,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[4:5], v3, off ; encoding: [0x00,0x80,0x70,0xdc,0x04,0x03,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -1766,10 +2006,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v0, 0 ; FIJI-NEXT: v_mov_b32_e32 v1, v0 ; FIJI-NEXT: v_mov_b32_e32 v2, v0 -; FIJI-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 unorm tfe ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 unorm tfe +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v2, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -1780,10 +2024,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 +; GFX6789-NEXT: v_mov_b32_e32 v4, s9 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 unorm tfe ; GFX6789-NEXT: v_mov_b32_e32 v3, s8 -; GFX6789-NEXT: v_mov_b32_e32 v4, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[3:4], v2, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1791,10 +2039,14 @@ ; NOPRT-LABEL: load_1d_tfe_V4_dmask2: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v2, 0 -; NOPRT-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x6 unorm tfe ; NOPRT-NEXT: v_mov_b32_e32 v3, s8 ; NOPRT-NEXT: v_mov_b32_e32 v4, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x6 unorm tfe +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[3:4], v2, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -1807,9 +2059,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x16,0x01,0xf0,0x03,0x00,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; encoding: [0x08,0x02,0x06,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[3:4], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x02,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -1840,10 +2096,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v0, 0 ; FIJI-NEXT: v_mov_b32_e32 v1, v0 -; FIJI-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -1853,10 +2113,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 +; GFX6789-NEXT: v_mov_b32_e32 v3, s9 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe ; GFX6789-NEXT: v_mov_b32_e32 v2, s8 -; GFX6789-NEXT: v_mov_b32_e32 v3, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[2:3], v1, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1864,10 +2128,14 @@ ; NOPRT-LABEL: load_1d_tfe_V4_dmask1: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v1, 0 -; NOPRT-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x8 unorm tfe ; NOPRT-NEXT: v_mov_b32_e32 v2, s8 ; NOPRT-NEXT: v_mov_b32_e32 v3, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x8 unorm tfe +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[2:3], v1, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -1879,9 +2147,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s9 ; encoding: [0x09,0x02,0x06,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x18,0x01,0xf0,0x02,0x00,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; encoding: [0x08,0x02,0x04,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -1912,10 +2184,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, v0 ; FIJI-NEXT: v_mov_b32_e32 v0, 0 ; FIJI-NEXT: v_mov_b32_e32 v1, v0 -; FIJI-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe ; FIJI-NEXT: s_mov_b32 s11, 0xf000 ; FIJI-NEXT: s_mov_b32 s10, -1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog @@ -1925,10 +2201,14 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 +; GFX6789-NEXT: v_mov_b32_e32 v3, s9 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe ; GFX6789-NEXT: v_mov_b32_e32 v2, s8 -; GFX6789-NEXT: v_mov_b32_e32 v3, s9 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[2:3], v1, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1936,10 +2216,14 @@ ; NOPRT-LABEL: load_1d_tfe_V2_dmask1: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v1, 0 -; NOPRT-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x8 unorm tfe ; NOPRT-NEXT: v_mov_b32_e32 v2, s8 ; NOPRT-NEXT: v_mov_b32_e32 v3, s9 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: s_nop 0 +; NOPRT-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x8 unorm tfe +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: global_store_dword v[2:3], v1, off ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog @@ -1951,9 +2235,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s9 ; encoding: [0x09,0x02,0x06,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x18,0x01,0xf0,0x02,0x00,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; encoding: [0x08,0x02,0x04,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -3208,7 +3496,9 @@ ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm ; FIJI-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf unorm +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf unorm ; FIJI-NEXT: s_endpgm ; @@ -3216,7 +3506,9 @@ ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX6789-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf unorm +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf unorm ; GFX6789-NEXT: s_endpgm ; @@ -3224,7 +3516,9 @@ ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm ; NOPRT-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf unorm +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; @@ -3233,7 +3527,9 @@ ; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x20,0xf0,0x04,0x00,0x00,0x00] ; GFX10-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x00,0xf0,0x04,0x00,0x02,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x20,0xf0,0x04,0x00,0x04,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll @@ -10,6 +10,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -22,6 +23,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -38,6 +40,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -50,6 +53,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -66,6 +70,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -78,6 +83,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -94,6 +100,7 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -106,6 +113,7 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -122,6 +130,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -134,6 +143,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -152,6 +162,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4_c_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -164,6 +175,7 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -180,6 +192,7 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -192,6 +205,7 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -208,6 +222,7 @@ ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -220,6 +235,7 @@ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -238,6 +254,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4_b_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -250,6 +267,7 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -269,6 +287,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-NEXT: v_lshl_or_b32 v6, v3, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[4:7], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -281,6 +300,7 @@ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -294,6 +314,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -303,6 +325,8 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -318,6 +342,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4_c_l v[0:3], v[3:5], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -327,6 +353,8 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -340,6 +368,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -349,6 +379,8 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -362,6 +394,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -371,6 +405,8 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,NONSA %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,NSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,NONSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,NSA %s ; GCN-LABEL: {{^}}sample_2d: ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -8,6 +8,7 @@ ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -18,6 +19,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -34,6 +36,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -46,6 +49,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -62,6 +66,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -74,6 +79,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -90,6 +96,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -102,6 +109,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -118,6 +126,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -130,6 +139,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -146,6 +156,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -158,6 +169,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -172,6 +184,7 @@ ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -182,6 +195,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -198,6 +212,7 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -210,6 +225,7 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -226,6 +242,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -238,6 +255,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -254,6 +272,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -266,6 +285,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -282,6 +302,7 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -294,6 +315,7 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -312,6 +334,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -324,6 +347,7 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -338,6 +362,7 @@ ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -348,6 +373,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -364,6 +390,7 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -376,6 +403,7 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -390,6 +418,7 @@ ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -400,6 +429,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -416,6 +446,7 @@ ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -428,6 +459,7 @@ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -444,6 +476,7 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -456,6 +489,7 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -474,6 +508,7 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_b_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -486,6 +521,7 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -502,6 +538,7 @@ ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -514,6 +551,7 @@ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -533,6 +571,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-NEXT: v_lshl_or_b32 v6, v3, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_b_cl v[0:3], v[4:7], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -545,6 +584,7 @@ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -581,6 +621,8 @@ ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_d v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -595,6 +637,8 @@ ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_d v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -616,6 +660,8 @@ ; GFX9-NEXT: v_lshl_or_b32 v11, v7, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_d v[0:3], v[7:14], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -630,6 +676,8 @@ ; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_d v[0:3], [v0, v2, v3, v5, v6, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -668,6 +716,8 @@ ; GFX9-NEXT: v_and_b32_e32 v2, v9, v7 ; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -682,6 +732,8 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v1, v3, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -695,6 +747,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -704,6 +758,8 @@ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -722,6 +778,8 @@ ; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_d_cl v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -736,6 +794,8 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -749,6 +809,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -758,6 +820,8 @@ ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -778,6 +842,8 @@ ; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 ; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_d_cl v[0:3], v[7:14], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -792,6 +858,8 @@ ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: v_lshl_or_b32 v6, v4, 16, v3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v6, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -828,6 +896,8 @@ ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_cd v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -842,6 +912,8 @@ ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_cd v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -880,6 +952,8 @@ ; GFX9-NEXT: v_and_b32_e32 v2, v9, v7 ; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -894,6 +968,8 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v1, v3, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -907,6 +983,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -916,6 +994,8 @@ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -934,6 +1014,8 @@ ; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_cd_cl v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -948,6 +1030,8 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -961,6 +1045,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -970,6 +1056,8 @@ ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -990,6 +1078,8 @@ ; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 ; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[7:14], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -1004,6 +1094,8 @@ ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: v_lshl_or_b32 v6, v4, 16, v3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v6, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -1017,6 +1109,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -1026,6 +1120,8 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -1039,6 +1135,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -1048,6 +1146,8 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -1061,6 +1161,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -1070,6 +1172,8 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -1085,6 +1189,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_l v[0:3], v[3:5], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -1094,6 +1200,8 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -1125,6 +1233,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -1134,6 +1244,8 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -1165,6 +1277,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -1174,6 +1288,8 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -1195,6 +1311,8 @@ ; GFX9-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_d_o v0, v[8:15], s[0:7], s[8:11] dmask:0x4 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -1209,6 +1327,8 @@ ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v7, v7, 16, v6 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v4, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -1230,6 +1350,8 @@ ; GFX9-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_d_o v[0:1], v[8:15], s[0:7], s[8:11] dmask:0x6 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -1244,6 +1366,8 @@ ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v7, v7, 16, v6 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v4, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -10,6 +10,7 @@ ; TONGA-NEXT: s_mov_b64 s[12:13], exec ; TONGA-NEXT: s_wqm_b64 exec, exec ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: ; return to shader part epilog @@ -29,6 +30,7 @@ ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -39,6 +41,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -57,9 +60,12 @@ ; TONGA-NEXT: v_mov_b32_e32 v5, s13 ; TONGA-NEXT: v_mov_b32_e32 v3, v2 ; TONGA-NEXT: s_and_b64 exec, exec, s[14:15] +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, v2 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: flat_store_dword v[4:5], v3 ; TONGA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; TONGA-NEXT: ; return to shader part epilog @@ -92,9 +98,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, s13 ; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[4:5], v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -109,9 +118,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, s13 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[4:5], v3, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog @@ -162,6 +174,8 @@ ; TONGA-NEXT: v_mov_b32_e32 v4, 0 ; TONGA-NEXT: v_mov_b32_e32 v5, v4 ; TONGA-NEXT: v_mov_b32_e32 v6, v4 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: image_sample_c_d v[4:6], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v5 @@ -185,6 +199,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_c_d v[4:5], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -198,6 +214,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -218,6 +236,7 @@ ; TONGA-NEXT: s_mov_b64 s[12:13], exec ; TONGA-NEXT: s_wqm_b64 exec, exec ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf d16 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -241,6 +260,7 @@ ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -251,6 +271,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -271,6 +292,7 @@ ; TONGA-NEXT: v_mov_b32_e32 v6, v3 ; TONGA-NEXT: v_mov_b32_e32 v7, v3 ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: image_sample_b v[3:7], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4 @@ -304,6 +326,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -323,6 +346,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -18,6 +18,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -28,6 +29,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -69,8 +71,11 @@ ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[6:7], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -89,8 +94,11 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -123,6 +131,7 @@ ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -136,6 +145,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x01,0x81,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -171,6 +181,7 @@ ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -184,6 +195,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x02,0x81,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -219,6 +231,7 @@ ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -232,6 +245,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x04,0x81,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -267,6 +281,7 @@ ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -280,6 +295,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x08,0x81,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -317,6 +333,7 @@ ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -331,6 +348,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x03,0x81,0xf0,0x03,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -370,6 +388,7 @@ ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -384,6 +403,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0a,0x81,0xf0,0x03,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -425,6 +445,7 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -440,6 +461,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0d,0x81,0xf0,0x04,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -491,8 +513,11 @@ ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[6:7], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -511,8 +536,11 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -539,6 +567,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -549,6 +578,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -572,6 +602,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -582,6 +613,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x10,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -605,6 +637,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf da ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -615,6 +648,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; encoding: [0x18,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -638,6 +672,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf da ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -648,6 +683,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x20,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -671,6 +707,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf da ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -681,6 +718,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x28,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -704,6 +742,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -714,6 +753,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa0,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -737,6 +777,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -747,6 +788,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa0,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -770,6 +812,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -780,6 +823,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x84,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -803,6 +847,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -813,6 +858,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x84,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -836,6 +882,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -846,6 +893,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa4,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -869,6 +917,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -879,6 +928,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa4,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -902,6 +952,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -912,6 +963,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x94,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -935,6 +987,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -945,6 +998,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x94,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -968,6 +1022,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -978,6 +1033,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xb4,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -1001,6 +1057,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1011,6 +1068,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xb4,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -1034,6 +1092,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1044,6 +1103,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x98,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -1067,6 +1127,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1077,6 +1138,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x98,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -1100,6 +1162,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1110,6 +1173,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xb8,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -1133,6 +1197,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1143,6 +1208,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xb8,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -1769,10 +1835,14 @@ ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v9, 0 ; GFX6789-NEXT: v_mov_b32_e32 v10, v9 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da ; GFX6789-NEXT: v_mov_b32_e32 v0, s12 ; GFX6789-NEXT: v_mov_b32_e32 v1, s13 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: global_store_dword v[0:1], v10, off ; GFX6789-NEXT: v_mov_b32_e32 v0, v9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) @@ -1786,9 +1856,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v11, s13 ; encoding: [0x0d,0x02,0x16,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_c_d_o v[0:1], [v10, v9, v2, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x04,0xe9,0xf0,0x0a,0x00,0x40,0x00,0x09,0x02,0x03,0x04,0x05,0x06,0x07,0x08] ; GFX10-NEXT: v_mov_b32_e32 v10, s12 ; encoding: [0x0c,0x02,0x14,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: global_store_dword v[10:11], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x01,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog @@ -1842,6 +1916,8 @@ ; GFX6789-NEXT: v_mov_b32_e32 v9, 0 ; GFX6789-NEXT: v_mov_b32_e32 v10, v9 ; GFX6789-NEXT: v_mov_b32_e32 v11, v9 +; GFX6789-NEXT: s_nop 0 +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: v_mov_b32_e32 v0, v9 @@ -1858,6 +1934,8 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample_c_d_o v[0:2], [v11, v10, v9, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x06,0xe9,0xf0,0x0b,0x00,0x40,0x00,0x0a,0x09,0x03,0x04,0x05,0x06,0x07,0x08] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -1889,6 +1967,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf unorm ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1899,6 +1978,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -1922,6 +2002,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf glc ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1932,6 +2013,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D glc ; encoding: [0x00,0x2f,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -1955,6 +2037,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf slc ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1965,6 +2048,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D slc ; encoding: [0x00,0x0f,0x80,0xf2,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -1988,6 +2072,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf glc slc ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -1998,6 +2083,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D glc slc ; encoding: [0x00,0x2f,0x80,0xf2,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -2021,6 +2107,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -2031,6 +2118,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x01,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -2055,6 +2143,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x3 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -2065,6 +2154,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x03,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -2089,6 +2179,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0x7 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -2099,6 +2190,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x07,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -2123,6 +2215,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -2133,6 +2226,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x06,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -2157,6 +2251,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -2167,6 +2262,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x9 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x09,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -2191,6 +2287,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -2201,6 +2298,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0a,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -2225,6 +2323,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0xe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -2235,6 +2334,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0xe dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0e,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -2277,6 +2377,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -2287,6 +2388,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x06,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -2311,6 +2413,7 @@ ; GFX6789-NEXT: s_mov_b64 s[12:13], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog @@ -2321,6 +2424,7 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0a,0x80,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll @@ -1,5 +1,5 @@ ;RUN: llc < %s -march=amdgcn -mcpu=verde -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI -;RUN: llc < %s -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-xnack -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI ;CHECK-LABEL: {{^}}test1: ;CHECK-NOT: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -145,7 +145,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_or: ;CHECK-NEXT: %bb. ;CHECK-NEXT: v_lshlrev_b32_e32 v{{[0-9]}}, 6, v0 -;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 +;CHECK: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 ;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) { @@ -214,7 +214,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged_or: ;CHECK-NEXT: %bb. ;CHECK-NEXT: v_lshlrev_b32_e32 v{{[0-9]}}, 4, v0 -;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 +;CHECK: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 ;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -192,7 +192,7 @@ ;CHECK-LABEL: {{^}}raw_buffer_store_byte: ;CHECK-NEXT: %bb. ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 +;CHECK: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 ;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) { main_body: @@ -205,7 +205,7 @@ ;CHECK-LABEL: {{^}}raw_buffer_store_short: ;CHECK-NEXT: %bb. ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 +;CHECK: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 ;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -50,45 +50,57 @@ ; VARIANT2-LABEL: test_barrier: ; VARIANT2: ; %bb.0: ; %entry ; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c +; VARIANT2-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT2-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT2-NEXT: v_mov_b32_e32 v4, s3 -; VARIANT2-NEXT: v_xad_u32 v1, v0, -1, s0 +; VARIANT2-NEXT: v_xad_u32 v1, v0, -1, s4 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3 ; VARIANT2-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2] ; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; VARIANT2-NEXT: global_store_dword v[3:4], v0, off ; VARIANT2-NEXT: v_mov_b32_e32 v5, s3 +; VARIANT2-NEXT: s_nop 0 +; VARIANT2-NEXT: s_nop 0 +; VARIANT2-NEXT: global_store_dword v[3:4], v0, off ; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1 ; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc ; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: s_barrier +; VARIANT2-NEXT: s_nop 0 +; VARIANT2-NEXT: s_nop 0 ; VARIANT2-NEXT: global_load_dword v0, v[0:1], off +; VARIANT2-NEXT: s_nop 0 ; VARIANT2-NEXT: s_waitcnt vmcnt(0) +; VARIANT2-NEXT: s_nop 0 ; VARIANT2-NEXT: global_store_dword v[3:4], v0, off ; VARIANT2-NEXT: s_endpgm ; ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry ; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c +; VARIANT3-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT3-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT3-NEXT: v_mov_b32_e32 v4, s3 -; VARIANT3-NEXT: v_xad_u32 v1, v0, -1, s0 +; VARIANT3-NEXT: v_xad_u32 v1, v0, -1, s4 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT3-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3 ; VARIANT3-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2] ; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; VARIANT3-NEXT: global_store_dword v[3:4], v0, off ; VARIANT3-NEXT: v_mov_b32_e32 v5, s3 +; VARIANT3-NEXT: s_nop 0 +; VARIANT3-NEXT: s_nop 0 +; VARIANT3-NEXT: global_store_dword v[3:4], v0, off ; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1 ; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc ; VARIANT3-NEXT: s_barrier +; VARIANT3-NEXT: s_nop 0 +; VARIANT3-NEXT: s_nop 0 ; VARIANT3-NEXT: global_load_dword v0, v[0:1], off +; VARIANT3-NEXT: s_nop 0 ; VARIANT3-NEXT: s_waitcnt vmcnt(0) +; VARIANT3-NEXT: s_nop 0 ; VARIANT3-NEXT: global_store_dword v[3:4], v0, off ; VARIANT3-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-xnack -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test1: ; CHECK-NOT: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -111,7 +111,7 @@ ;CHECK-LABEL: {{^}}struct_buffer_store_byte: ;CHECK-NEXT: %bb. ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen +;CHECK: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen ;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) { main_body: @@ -124,7 +124,7 @@ ;CHECK-LABEL: {{^}}struct_buffer_store_f16: ;CHECK-NEXT: %bb. ;CHECK-NEXT: v_cvt_f16_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen +;CHECK: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen ;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, i32 %index) { %v2 = fptrunc float %v1 to half @@ -151,7 +151,7 @@ ;CHECK-LABEL: {{^}}struct_buffer_store_i16: ;CHECK-NEXT: %bb. ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen +;CHECK: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen ;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1, i32 %index) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -18,12 +18,14 @@ ; VI-LABEL: bfe_u32_arg_arg_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_bfe_u32 v0, v0, s1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_bfe_u32 v0, v0, s3, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1) @@ -48,13 +50,15 @@ ; VI-LABEL: bfe_u32_arg_arg_imm: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_mov_b32_e32 v1, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_bfe_u32 v0, s0, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_bfe_u32 v0, s2, v0, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123) @@ -79,13 +83,15 @@ ; VI-LABEL: bfe_u32_arg_imm_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_bfe_u32 v0, s0, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_bfe_u32 v0, s2, v0, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2) @@ -111,14 +117,16 @@ ; VI-LABEL: bfe_u32_imm_arg_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_movk_i32 s2, 0x7b +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_movk_i32 s8, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_bfe_u32 v0, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_bfe_u32 v0, s8, v0, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2) @@ -143,7 +151,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0) @@ -168,7 +178,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0) @@ -206,8 +218,12 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %load = load i8, i8 addrspace(1)* %in @@ -250,10 +266,14 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 @@ -296,10 +316,14 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 @@ -343,11 +367,15 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xfe, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 1, 8 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 @@ -391,11 +419,15 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xf8, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 3, 8 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 @@ -439,11 +471,15 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_and_b32_e32 v0, 0x80, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 7, 8 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 @@ -486,10 +522,14 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %load = load i32, i32 addrspace(1)* %in, align 4 @@ -531,9 +571,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -560,6 +604,8 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -587,6 +633,8 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -614,6 +662,8 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -655,9 +705,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfe_i32 v0, v0, 0, 1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -700,10 +754,14 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0 ; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -744,9 +802,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -787,9 +849,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -830,9 +896,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -872,9 +942,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -914,9 +988,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -956,9 +1034,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -999,9 +1081,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -1028,6 +1114,8 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 @@ -1054,7 +1142,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0) @@ -1080,7 +1170,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0) @@ -1106,7 +1198,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1) @@ -1132,7 +1226,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1) @@ -1158,7 +1254,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -1 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1) @@ -1184,7 +1282,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1) @@ -1210,7 +1310,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x80 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8) @@ -1236,7 +1338,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8) @@ -1262,7 +1366,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8) @@ -1288,7 +1394,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8) @@ -1314,7 +1422,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16) @@ -1340,7 +1450,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4) @@ -1366,7 +1478,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1) @@ -1392,7 +1506,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16) @@ -1418,7 +1534,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 40 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30) @@ -1444,7 +1562,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28) @@ -1470,7 +1590,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7) @@ -1496,7 +1618,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31) @@ -1522,7 +1646,9 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1) @@ -1565,14 +1691,18 @@ ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 63, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 2, 2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1602,12 +1732,14 @@ ; VI-LABEL: lshr_and: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s0, s0, 0x30006 +; VI-NEXT: s_bfe_u32 s0, s2, 0x30006 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %b = lshr i32 %a, 6 @@ -1632,12 +1764,14 @@ ; VI-LABEL: v_lshr_and: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_bfe_u32 v0, s0, v0, 3 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_bfe_u32 v0, s2, v0, 3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %c = lshr i32 %a, %b @@ -1662,12 +1796,14 @@ ; VI-LABEL: and_lshr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s0, s0, 0x30006 +; VI-NEXT: s_bfe_u32 s0, s2, 0x30006 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %b = and i32 %a, 448 @@ -1692,12 +1828,14 @@ ; VI-LABEL: and_lshr2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s0, s0, 0x30006 +; VI-NEXT: s_bfe_u32 s0, s2, 0x30006 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %b = and i32 %a, 511 @@ -1722,12 +1860,14 @@ ; VI-LABEL: shl_lshr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s0, s0, 0x150002 +; VI-NEXT: s_bfe_u32 s0, s2, 0x150002 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %b = shl i32 %a, 9 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -32,6 +32,8 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -39,6 +41,8 @@ ; GFX8-NEXT: v_fract_f16_e32 v0, v0 ; GFX8-NEXT: v_cos_f16_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -48,12 +52,16 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0 ; GFX9-NEXT: v_cos_f16_e32 v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm %a.val = load half, half addrspace(1)* %a @@ -100,6 +108,8 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -112,6 +122,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -122,6 +134,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v0 @@ -132,6 +146,8 @@ ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %a.val = load <2 x half>, <2 x half> addrspace(1)* %a diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -52,6 +52,8 @@ ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -59,6 +61,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -77,6 +81,8 @@ ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -84,6 +90,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, @@ -131,10 +139,14 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -150,10 +162,14 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, @@ -199,10 +215,14 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -218,10 +238,14 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, @@ -286,6 +310,8 @@ ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -304,6 +330,8 @@ ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, @@ -357,6 +385,8 @@ ; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0 ; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -373,6 +403,8 @@ ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: s_mov_b32 s4, 0x44004200 ; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, @@ -424,6 +456,8 @@ ; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 ; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -440,6 +474,8 @@ ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: s_mov_b32 s4, 0x42004400 ; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, @@ -501,20 +537,22 @@ ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s4, s4 -; VI-NEXT: v_max_f16_e64 v0, s6, s6 +; VI-NEXT: v_max_f16_e64 v0, s10, s10 ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: s_lshr_b32 s6, s6, 16 +; VI-NEXT: s_lshr_b32 s6, s10, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 ; VI-NEXT: v_max_f16_e64 v1, s6, s6 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v1, s11, s11 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e32 v1, v2, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -529,14 +567,16 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -610,26 +650,28 @@ ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s5, s5 -; VI-NEXT: v_max_f16_e64 v0, s7, s7 +; VI-NEXT: v_max_f16_e64 v0, s11, s11 ; VI-NEXT: s_lshr_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b32 s6, s11, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 -; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v1, s6, s6 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_max_f16_e64 v2, s4, s4 -; VI-NEXT: v_max_f16_e64 v0, s6, s6 +; VI-NEXT: v_max_f16_e64 v0, s10, s10 ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: s_lshr_b32 s5, s6, 16 +; VI-NEXT: s_lshr_b32 s5, s10, 16 ; VI-NEXT: v_max_f16_e32 v0, v2, v0 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e64 v3, s4, s4 ; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -643,14 +685,16 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <4 x half> addrspace(1)* %r, @@ -724,6 +768,8 @@ ; VI-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -743,6 +789,8 @@ ; GFX9-NEXT: v_pk_max_f16 v2, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v1, v0, s8 ; GFX9-NEXT: v_pk_max_f16 v0, v2, s9 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <4 x half> addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -52,6 +52,8 @@ ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -59,6 +61,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -77,6 +81,8 @@ ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -84,6 +90,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, @@ -154,10 +162,14 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -173,10 +185,14 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, @@ -222,10 +238,14 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_min_f16_e32 v0, 4.0, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -241,10 +261,14 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, @@ -309,6 +333,8 @@ ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -327,6 +353,8 @@ ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, @@ -410,6 +438,8 @@ ; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0 ; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -426,6 +456,8 @@ ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: s_mov_b32 s4, 0x44004200 ; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, @@ -477,6 +509,8 @@ ; VI-NEXT: v_min_f16_e32 v0, 4.0, v0 ; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -493,6 +527,8 @@ ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: s_mov_b32 s4, 0x42004400 ; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, @@ -554,20 +590,22 @@ ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s4, s4 -; VI-NEXT: v_max_f16_e64 v0, s6, s6 +; VI-NEXT: v_max_f16_e64 v0, s10, s10 ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: s_lshr_b32 s6, s6, 16 +; VI-NEXT: s_lshr_b32 s6, s10, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 ; VI-NEXT: v_max_f16_e64 v1, s6, s6 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v1, s11, s11 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_min_f16_e32 v1, v2, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -582,14 +620,16 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -663,26 +703,28 @@ ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s5, s5 -; VI-NEXT: v_max_f16_e64 v0, s7, s7 +; VI-NEXT: v_max_f16_e64 v0, s11, s11 ; VI-NEXT: s_lshr_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b32 s6, s11, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 -; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v1, s6, s6 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_max_f16_e64 v2, s4, s4 -; VI-NEXT: v_max_f16_e64 v0, s6, s6 +; VI-NEXT: v_max_f16_e64 v0, s10, s10 ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: s_lshr_b32 s5, s6, 16 +; VI-NEXT: s_lshr_b32 s5, s10, 16 ; VI-NEXT: v_min_f16_e32 v0, v2, v0 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e64 v3, s4, s4 ; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -696,14 +738,16 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <4 x half> addrspace(1)* %r, @@ -777,6 +821,8 @@ ; VI-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -796,6 +842,8 @@ ; GFX9-NEXT: v_pk_max_f16 v2, s4, s4 ; GFX9-NEXT: v_pk_min_f16 v1, v0, s8 ; GFX9-NEXT: v_pk_min_f16 v0, v2, s9 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <4 x half> addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -137,7 +137,7 @@ ; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 ; GCN-NOT: 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] +; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_x_known_bits(i32 addrspace(1)* %out) { entry: %size = call i32 @llvm.r600.read.local.size.x() #0 @@ -152,7 +152,7 @@ ; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c ; GCN-NOT: 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] +; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_y_known_bits(i32 addrspace(1)* %out) { entry: %size = call i32 @llvm.r600.read.local.size.y() #0 @@ -167,7 +167,7 @@ ; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 ; GCN-NOT: 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] +; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_z_known_bits(i32 addrspace(1)* %out) { entry: %size = call i32 @llvm.r600.read.local.size.z() #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -32,6 +32,8 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -39,6 +41,8 @@ ; GFX8-NEXT: v_fract_f16_e32 v0, v0 ; GFX8-NEXT: v_sin_f16_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -48,12 +52,16 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0 ; GFX9-NEXT: v_sin_f16_e32 v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm %a.val = load half, half addrspace(1)* %a @@ -100,6 +108,8 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -112,6 +122,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -122,6 +134,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v0 @@ -132,6 +146,8 @@ ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %a.val = load <2 x half>, <2 x half> addrspace(1)* %a diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s -; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca,-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca,-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo: ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -75,6 +75,8 @@ ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -86,6 +88,8 @@ ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -98,6 +102,8 @@ ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -179,7 +185,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16_d16 v1, v0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -192,6 +200,8 @@ ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -204,6 +214,8 @@ ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -223,6 +235,8 @@ ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -234,6 +248,8 @@ ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -246,6 +262,8 @@ ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -262,7 +280,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u8_d16 v1, v0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -274,6 +294,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -287,6 +309,8 @@ ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -307,6 +331,8 @@ ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -318,6 +344,8 @@ ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -330,6 +358,8 @@ ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -347,7 +377,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_i8_d16 v1, v0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -359,6 +391,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -371,6 +405,8 @@ ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -391,6 +427,8 @@ ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -402,6 +440,8 @@ ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -414,6 +454,8 @@ ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -434,6 +476,8 @@ ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -445,6 +489,8 @@ ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -457,6 +503,8 @@ ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -478,6 +526,8 @@ ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -489,6 +539,8 @@ ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -501,6 +553,8 @@ ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -524,6 +578,8 @@ ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: ds_write_b16 v3, v0 ; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -537,6 +593,8 @@ ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: ds_write_b16 v3, v0 ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -551,6 +609,8 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: ds_write_b16 v2, v0 ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -571,7 +631,9 @@ ; GFX900-NEXT: ds_read_u16_d16 v1, v0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ds_write_b16 v0, v2 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -586,6 +648,8 @@ ; GFX906-NEXT: ds_write_b16 v4, v3 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -601,6 +665,8 @@ ; GFX803-NEXT: ds_write_b16 v3, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -624,6 +690,8 @@ ; GFX900-NEXT: ds_write_b16 v2, v0 ; GFX900-NEXT: ds_write_b16 v3, v5 ; GFX900-NEXT: v_bfi_b32 v0, v4, v0, v1 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -638,6 +706,8 @@ ; GFX906-NEXT: ds_write_b16 v2, v0 ; GFX906-NEXT: ds_write_b16 v3, v5 ; GFX906-NEXT: v_bfi_b32 v0, v4, v0, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -653,6 +723,8 @@ ; GFX803-NEXT: ds_write_b16 v2, v0 ; GFX803-NEXT: ds_write_b16 v3, v1 ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -671,7 +743,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -683,6 +757,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -692,10 +768,14 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_load_ushort v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -713,7 +793,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -726,6 +808,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -735,10 +819,14 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_load_ushort v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -756,7 +844,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -768,6 +858,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -777,11 +869,15 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] ; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -800,7 +896,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -812,6 +910,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -821,10 +921,14 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_load_sbyte v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -843,7 +947,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -856,6 +962,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -865,11 +973,15 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] ; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -889,7 +1001,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -902,6 +1016,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -911,10 +1027,14 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_load_sbyte v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -934,7 +1054,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: flat_load_short_d16 v2, v[0:1] +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -946,6 +1068,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -957,6 +1081,8 @@ ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -973,7 +1099,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: flat_load_short_d16 v2, v[0:1] +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -986,6 +1114,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -997,6 +1127,8 @@ ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1015,7 +1147,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1] +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1027,6 +1161,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1039,6 +1175,8 @@ ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1056,7 +1194,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1] +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1068,6 +1208,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1079,6 +1221,8 @@ ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1096,7 +1240,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1] +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1109,6 +1255,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1121,6 +1269,8 @@ ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1139,7 +1289,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1] +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1152,6 +1304,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1163,6 +1317,8 @@ ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1181,7 +1337,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1193,6 +1351,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1204,6 +1364,8 @@ ; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1224,6 +1386,8 @@ ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1235,6 +1399,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1246,6 +1412,8 @@ ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1263,7 +1431,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1276,6 +1446,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1287,6 +1459,8 @@ ; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1304,7 +1478,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1316,6 +1492,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1327,6 +1505,8 @@ ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1343,7 +1523,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1355,6 +1537,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1366,6 +1550,8 @@ ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1382,7 +1568,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1395,6 +1583,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1406,6 +1596,8 @@ ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1422,7 +1614,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1434,6 +1628,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1446,6 +1642,8 @@ ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1464,7 +1662,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1476,6 +1676,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1487,6 +1689,8 @@ ; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1505,7 +1709,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1517,6 +1723,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1529,6 +1737,8 @@ ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1546,7 +1756,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s33 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1558,6 +1770,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1569,6 +1783,8 @@ ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1586,7 +1802,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1599,6 +1817,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1611,6 +1831,8 @@ ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1629,7 +1851,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1641,6 +1865,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1650,10 +1876,14 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_load_ushort v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1671,7 +1901,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1684,6 +1916,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1693,10 +1927,14 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_load_ushort v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1714,7 +1952,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1727,6 +1967,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1736,11 +1978,15 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] ; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1760,7 +2006,9 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1773,6 +2021,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1782,10 +2032,14 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_load_sbyte v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1805,9 +2059,13 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1816,11 +2074,15 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1829,11 +2091,15 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1855,9 +2121,13 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1866,11 +2136,15 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1879,11 +2153,15 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1906,9 +2184,13 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1917,11 +2199,15 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1930,12 +2216,16 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 ; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1958,9 +2248,13 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1969,12 +2263,16 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1983,11 +2281,15 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -2011,9 +2313,13 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2022,12 +2328,16 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2036,12 +2346,16 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 ; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX803-NEXT: s_nop 0 +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -10,12 +10,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v2 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, s5, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -23,20 +25,22 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dword s5, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x30 +; VI-NEXT: s_load_dword s6, s[0:1], 0x30 ; VI-NEXT: s_mov_b32 s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_lshr_b32 s1, s5, 16 -; VI-NEXT: s_lshr_b32 s6, s0, 16 -; VI-NEXT: s_lshr_b32 s1, s1, s6 -; VI-NEXT: s_and_b32 s5, s5, s4 -; VI-NEXT: s_and_b32 s0, s0, s4 -; VI-NEXT: s_lshr_b32 s0, s5, s0 -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_and_b32 s1, s5, s4 +; VI-NEXT: s_lshr_b32 s0, s5, 16 +; VI-NEXT: s_lshr_b32 s5, s6, 16 +; VI-NEXT: s_lshr_b32 s0, s0, s5 +; VI-NEXT: s_and_b32 s4, s6, s4 +; VI-NEXT: s_lshr_b32 s1, s1, s4 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -74,6 +78,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 @@ -81,6 +87,8 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -94,15 +102,19 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v5 -; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_lshrrev_b16_e32 v2, v6, v5 +; VI-NEXT: v_lshrrev_b16_sdwa v3, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -147,40 +159,48 @@ ; GFX9-LABEL: lshr_v_s_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v3 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, s2, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_s_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v2, s0, v3 +; VI-NEXT: v_lshrrev_b16_e32 v2, s2, v3 ; VI-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -222,40 +242,48 @@ ; GFX9-LABEL: lshr_s_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, s0 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, s2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_s_v_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v2, v3, s0 +; VI-NEXT: v_lshrrev_b16_e64 v2, v3, s2 ; VI-NEXT: v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -302,12 +330,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, 8 op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -320,6 +352,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -328,6 +362,8 @@ ; VI-NEXT: v_lshrrev_b16_e64 v2, v4, 8 ; VI-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -370,12 +406,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v2, 8, v3 op_sel_hi:[0,1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -387,6 +427,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -395,6 +437,8 @@ ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -428,45 +472,53 @@ ; GFX9-LABEL: v_lshr_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v3 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_pk_lshrrev_b16 v3, v5, v3 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1 -; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0 -; VI-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v1, v6, v1 -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: v_lshrrev_b16_e32 v2, v7, v5 +; VI-NEXT: v_lshrrev_b16_sdwa v3, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b16_e32 v5, v6, v4 +; VI-NEXT: v_lshrrev_b16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v3, v2, v3 +; VI-NEXT: v_or_b32_e32 v2, v5, v4 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_lshr_v4i16: @@ -523,13 +575,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -541,10 +597,12 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v0 @@ -552,6 +610,8 @@ ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -119,7 +119,7 @@ ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; GCN: s_waitcnt ; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}} -; GFX9-NEXT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, v3 +; GFX9: global_store_short v{{\[[0-9]+:[0-9]+\]}}, v3 ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}} ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll --- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll @@ -1,8 +1,8 @@ ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN1 -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2 -; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2 +; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2 +; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global,-xnack -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -7,44 +7,52 @@ ; VI-LABEL: v_test_imax_sge_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ushort v5, v[0:1] -; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: flat_load_ushort v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v5, v2 +; VI-NEXT: v_max_i16_e32 v2, v5, v6 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v5, v[0:1], off -; GFX9-NEXT: global_load_ushort v2, v[2:3], off +; GFX9-NEXT: global_load_ushort v6, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v2, v5, v2 +; GFX9-NEXT: v_max_i16_e32 v2, v5, v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -64,46 +72,54 @@ ; VI-LABEL: v_test_imax_sge_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v3, v5, v2 -; VI-NEXT: v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_max_i16_e32 v2, v5, v6 +; VI-NEXT: v_max_i16_sdwa v3, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v2, v5, v2 +; GFX9-NEXT: v_pk_max_i16 v2, v5, v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -123,42 +139,48 @@ ; VI-LABEL: v_test_imax_sge_v3i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ushort v8, v[4:5] ; VI-NEXT: flat_load_dword v9, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_ushort v10, v[0:1] +; VI-NEXT: flat_load_dword v11, v[2:3] ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_max_i16_e32 v0, v8, v0 +; VI-NEXT: v_max_i16_e32 v2, v8, v10 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v9, v1 -; VI-NEXT: v_max_i16_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: flat_store_short v[4:5], v0 -; VI-NEXT: flat_store_dword v[6:7], v1 +; VI-NEXT: v_max_i16_e32 v0, v9, v11 +; VI-NEXT: v_max_i16_sdwa v1, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_store_short v[4:5], v2 +; VI-NEXT: flat_store_dword v[6:7], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 @@ -166,12 +188,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v6, v[0:1], off offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v5, v[2:3], off offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 @@ -180,6 +210,8 @@ ; GFX9-NEXT: v_pk_max_i16 v3, v6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v2, v7, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v3, off offset:4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -200,51 +232,59 @@ ; VI-LABEL: v_test_imax_sge_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v6, v1, v3 -; VI-NEXT: v_max_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_i16_e32 v3, v0, v2 -; VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v1, v6, v1 -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: v_max_i16_e32 v2, v5, v7 +; VI-NEXT: v_max_i16_sdwa v3, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_i16_e32 v5, v4, v6 +; VI-NEXT: v_max_i16_sdwa v4, v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v3, v2, v3 +; VI-NEXT: v_or_b32_e32 v2, v5, v4 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v1, v1, v3 -; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_pk_max_i16 v3, v5, v7 +; GFX9-NEXT: v_pk_max_i16 v2, v4, v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid @@ -263,44 +303,52 @@ ; VI-LABEL: v_test_imax_sgt_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ushort v5, v[0:1] -; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: flat_load_ushort v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v5, v2 +; VI-NEXT: v_max_i16_e32 v2, v5, v6 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sgt_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v5, v[0:1], off -; GFX9-NEXT: global_load_ushort v2, v[2:3], off +; GFX9-NEXT: global_load_ushort v6, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v2, v5, v2 +; GFX9-NEXT: v_max_i16_e32 v2, v5, v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -320,44 +368,52 @@ ; VI-LABEL: v_test_umax_uge_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ushort v5, v[0:1] -; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: flat_load_ushort v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v2, v5, v2 +; VI-NEXT: v_max_u16_e32 v2, v5, v6 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_uge_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v5, v[0:1], off -; GFX9-NEXT: global_load_ushort v2, v[2:3], off +; GFX9-NEXT: global_load_ushort v6, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v2, v5, v2 +; GFX9-NEXT: v_max_u16_e32 v2, v5, v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -377,44 +433,52 @@ ; VI-LABEL: v_test_umax_ugt_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ushort v5, v[0:1] -; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: flat_load_ushort v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v2, v5, v2 +; VI-NEXT: v_max_u16_e32 v2, v5, v6 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_ugt_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v5, v[0:1], off -; GFX9-NEXT: global_load_ushort v2, v[2:3], off +; GFX9-NEXT: global_load_ushort v6, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v2, v5, v2 +; GFX9-NEXT: v_max_u16_e32 v2, v5, v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -433,46 +497,54 @@ ; VI-LABEL: v_test_umax_ugt_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v3, v5, v2 -; VI-NEXT: v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_max_u16_e32 v2, v5, v6 +; VI-NEXT: v_max_u16_sdwa v3, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_ugt_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_u16 v2, v5, v2 +; GFX9-NEXT: v_pk_max_u16 v2, v5, v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll @@ -1,7 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10CU %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode,-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10CU %s ; GCN-LABEL: {{^}}system_one_as_monotonic_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll @@ -1,7 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10CU %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode,-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10CU %s ; GCN-LABEL: {{^}}system_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -1,9 +1,9 @@ -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -mattr=-xnack -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-xnack -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,-xnack -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode,-xnack -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -1,9 +1,9 @@ -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -mattr=-xnack -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-xnack -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,-xnack -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode,-xnack -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -85,10 +85,10 @@ ; SI: s_min_i32 ; SI: s_min_i32 -; VI: s_min_i32 -; VI: s_min_i32 -; VI: s_min_i32 -; VI: v_min_i32_sdwa +; VI-DAG: s_min_i32 +; VI-DAG: s_min_i32 +; VI-DAG: s_min_i32 +; VI-DAG: v_min_i32_sdwa ; GFX9_10: v_min_i16 ; GFX9_10: v_min_i16 @@ -495,7 +495,9 @@ ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: ; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} ; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} -; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] +; GCN-DAG: s_and_b32 [[C:s[0-9]+]], [[A]], {{s[0-9]+}} +; GCN-DAG: s_and_b32 [[D:s[0-9]+]], [[B]], {{s[0-9]+}} +; GCN: s_min_u32 [[MIN:s[0-9]+]], [[C]], [[D]] ; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] ; GCN: buffer_store_dword [[VMIN]] diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -93,6 +93,8 @@ ; GFX9-NEXT: s_or_b64 s[12:13], s[4:5], s[12:13] ; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], v10, v8 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v11, v9, s[4:5] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v8, v[8:9], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[6:7] @@ -155,10 +157,16 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -189,11 +197,15 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v35, s34, 4 ; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x800 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s34 ; 4-byte Folded Spill @@ -220,12 +232,15 @@ ; GFX9-NEXT: v_readlane_b32 s5, v35, 3 ; GFX9-NEXT: v_readlane_b32 s37, v35, 1 ; GFX9-NEXT: v_readlane_b32 s36, v35, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s34 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 ; GFX9-NEXT: v_readlane_b32 s34, v35, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -28,7 +28,7 @@ ; GCN-NOT: bfe ; GCN-NOT: ashr ; GCN: v_mul_hi_i32_i24_e32 [[RESULT:v[0-9]+]], -; GCN-NEXT: buffer_store_dword [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] ; EG: ASHR ; EG: ASHR diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -101,7 +101,7 @@ ; FUNC-LABEL: {{^}}test_umulhi24_i32_i64: ; GCN-NOT: and ; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], -; GCN-NEXT: buffer_store_dword [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %a.24 = and i32 %a, 16777215 @@ -118,7 +118,7 @@ ; FUNC-LABEL: {{^}}test_umulhi24: ; GCN-NOT: and ; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], -; GCN-NEXT: buffer_store_dword [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { entry: %a.24 = and i64 %a, 16777215 @@ -206,7 +206,7 @@ ; GCN-NOT: lshr ; GCN: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], ; GCN: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] -; GCN-NEXT: buffer_store_dword v[[HI]] +; GCN: buffer_store_dword v[[HI]] define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { entry: %tmp0 = shl i33 %a, 9 diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -12,7 +12,7 @@ ; Spill CSR VGPR used for SGPR spilling ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-DAG: v_writelane_b32 v32, s34, 2 ; GCN-DAG: s_mov_b32 s34, s32 @@ -28,7 +28,7 @@ ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s34, v32, 2 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll --- a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll +++ b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}sample_contig_nsa: ; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], diff --git a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir --- a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: hazard_image_sample_d_buf_off6 # GCN: IMAGE_SAMPLE diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll --- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll +++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GCN,GFX1010,GFX1010W32 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX1010,GFX1010W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack < %s | FileCheck --check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack < %s | FileCheck --check-prefixes=GCN,GFX1010,GFX1010W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-xnack < %s | FileCheck --check-prefixes=GCN,GFX1010,GFX1010W64 %s ; GCN-LABEL: {{^}}max_occupancy: ; GFX9: ; Occupancy: 10 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -20,6 +20,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, 1 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -43,6 +45,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -66,6 +70,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -80,6 +86,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -91,6 +99,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -105,6 +115,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -116,6 +128,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -130,6 +144,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -141,6 +157,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -155,6 +173,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -166,6 +186,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -189,6 +211,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -203,6 +227,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -214,6 +240,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -228,6 +256,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -239,6 +269,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x3fff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -253,6 +285,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -264,6 +298,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -278,6 +314,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -289,6 +327,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -303,6 +343,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -314,6 +356,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -329,6 +373,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -340,6 +386,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -355,6 +403,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -366,6 +416,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -381,6 +433,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -392,6 +446,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -407,6 +463,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -418,6 +476,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -433,6 +493,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -444,6 +506,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -459,6 +523,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -470,6 +536,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -486,6 +554,8 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -497,6 +567,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -513,6 +585,8 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -524,6 +598,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -540,6 +616,8 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -551,6 +629,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -567,6 +647,8 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -578,6 +660,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -594,6 +678,8 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -605,6 +691,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -621,6 +709,8 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -632,6 +722,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -647,8 +739,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -661,8 +757,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 1 @@ -678,8 +778,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -692,8 +796,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 2047 @@ -709,8 +817,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -723,8 +835,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 4095 @@ -742,8 +858,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -756,8 +876,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8191 @@ -775,8 +899,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -789,8 +917,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -2048 @@ -808,8 +940,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -822,8 +958,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -4096 @@ -841,8 +981,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -855,8 +999,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -8192 @@ -872,8 +1020,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -886,8 +1038,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 4095 @@ -905,8 +1061,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -919,8 +1079,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8191 @@ -938,8 +1102,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -952,8 +1120,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 16383 @@ -971,8 +1143,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -985,8 +1161,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -4096 @@ -1004,8 +1184,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1018,8 +1202,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -8192 @@ -1037,8 +1225,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1051,8 +1243,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -16384 @@ -1070,8 +1266,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1084,8 +1284,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8589936639 @@ -1103,8 +1307,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1117,8 +1325,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8589936640 @@ -1136,8 +1348,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1150,8 +1366,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8589938687 @@ -1170,8 +1390,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1184,8 +1408,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8589938688 @@ -1204,8 +1432,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1218,8 +1450,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8589942783 @@ -1238,8 +1474,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1252,8 +1492,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8589942784 @@ -1272,8 +1516,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1286,8 +1534,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -9223372036854773761 @@ -1306,8 +1558,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1320,8 +1576,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -9223372036854773760 @@ -1340,8 +1600,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1354,8 +1618,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -9223372036854771713 @@ -1375,8 +1643,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1389,8 +1661,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -9223372036854771712 @@ -1410,8 +1686,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1424,8 +1704,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -9223372036854767617 @@ -1445,8 +1729,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm ; @@ -1459,8 +1747,12 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -9223372036854767616 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -62,6 +62,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -76,6 +78,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -87,6 +91,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -131,6 +137,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -145,6 +153,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -156,6 +166,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -179,6 +191,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -193,6 +207,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -204,6 +220,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -218,6 +236,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -229,6 +249,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x3800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -252,6 +274,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -266,6 +290,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -277,6 +303,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -291,6 +319,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -302,6 +332,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -317,6 +349,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -328,6 +362,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -343,6 +379,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -354,6 +392,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -369,6 +409,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -380,6 +422,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -395,6 +439,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -406,6 +452,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -421,6 +469,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -432,6 +482,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -447,6 +499,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -458,6 +512,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -474,6 +530,8 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -485,6 +543,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -501,6 +561,8 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -512,6 +574,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -528,6 +592,8 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -539,6 +605,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -555,6 +623,8 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -566,6 +636,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -582,6 +654,8 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -593,6 +667,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -609,6 +685,8 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -620,6 +698,8 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -635,8 +715,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -647,8 +731,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1 @@ -664,8 +752,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -676,8 +768,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047 @@ -693,8 +789,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -705,8 +805,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095 @@ -724,8 +828,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -736,8 +844,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191 @@ -753,8 +865,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -765,8 +881,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048 @@ -782,8 +902,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -794,8 +918,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xfffff000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096 @@ -813,8 +941,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -825,8 +957,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffe000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192 @@ -842,8 +978,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -854,8 +994,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095 @@ -873,8 +1017,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -885,8 +1033,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191 @@ -904,8 +1056,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -916,8 +1072,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x3800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383 @@ -933,8 +1093,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -945,8 +1109,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xfffff000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096 @@ -964,8 +1132,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -976,8 +1148,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffe000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192 @@ -995,8 +1171,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1007,8 +1187,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffc000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384 @@ -1026,8 +1210,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1038,8 +1226,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639 @@ -1057,8 +1249,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1069,8 +1265,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640 @@ -1088,8 +1288,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1100,8 +1304,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687 @@ -1120,8 +1328,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1132,8 +1344,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688 @@ -1152,8 +1368,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1164,8 +1384,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783 @@ -1184,8 +1408,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1196,8 +1424,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x2000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784 @@ -1216,8 +1448,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1229,8 +1465,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761 @@ -1249,8 +1489,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1262,8 +1506,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760 @@ -1282,8 +1530,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1295,8 +1547,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713 @@ -1316,8 +1572,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1329,8 +1589,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712 @@ -1350,8 +1614,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1363,8 +1631,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617 @@ -1384,8 +1656,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; @@ -1397,8 +1673,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -94,9 +94,9 @@ ; FUNC-LABEL: {{^}}scalar_or_inline_imm_i64: ; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-NOT: or_b32 -; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63 +; SI: s_or_b32 s[[VAL:[0-9]+]], s[[VAL_LO]], 63 ; SI-NOT: or_b32 -; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]] +; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL]] ; SI-NOT: or_b32 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] ; SI-NOT: or_b32 @@ -120,8 +120,8 @@ } ; FUNC-LABEL: {{^}}scalar_or_neg_inline_imm_i64: -; SI-DAG: s_load_dword [[VAL:s[0-9]+]] -; SI-DAG: s_or_b32 [[VAL]], [[VAL]], -8 +; SI-DAG: s_load_dword [[LOAD:s[0-9]+]] +; SI-DAG: s_or_b32 [[VAL:s[0-9]+]], [[LOAD]], -8 ; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}} ; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]] ; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll --- a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll @@ -46,12 +46,12 @@ } ; GCN-LABEL: reassoc_v2i32: -; GCN: s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_add_i32 [[ADD2:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} -; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}} -; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD2]], v{{[0-9]+}} -; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}} -; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD2]], v{{[0-9]+}} +; GCN-DAG: s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_add_i32 [[ADD2:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} +; GFX8-DAG: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}} +; GFX8-DAG: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD2]], v{{[0-9]+}} +; GFX9-DAG: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}} +; GFX9-DAG: v_add_u32_e32 v{{[0-9]+}}, [[ADD2]], v{{[0-9]+}} define amdgpu_kernel void @reassoc_v2i32(<2 x i32> addrspace(1)* %arg, <2 x i32> %x, <2 x i32> %y) { bb: %t1 = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll --- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; TODO: Some of those tests fail with OS == amdhsa due to unreasonable register ; allocation differences. diff --git a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll --- a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; SI-LABEL: {{^}}s_mulk_i32_k0: ; SI: s_load_dword [[VAL:s[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -37,42 +37,46 @@ ; VI-LABEL: saddo_i64_zext: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: s_add_u32 s2, s6, s0 -; VI-NEXT: s_addc_u32 s3, s7, s1 +; VI-NEXT: s_add_u32 s0, s6, s2 +; VI-NEXT: s_addc_u32 s1, s7, s3 ; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] -; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[1:2] +; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: saddo_i64_zext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: s_add_u32 s2, s6, s0 -; GFX9-NEXT: s_addc_u32 s3, s7, s1 +; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[1:2] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind @@ -113,40 +117,48 @@ ; VI-LABEL: s_saddo_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0 -; VI-NEXT: s_add_i32 s1, s0, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4 -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], s3, 0 +; VI-NEXT: s_add_i32 s3, s2, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, s3, v4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc -; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_byte v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_saddo_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0 -; GFX9-NEXT: s_add_i32 s1, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], s3, 0 +; GFX9-NEXT: s_add_i32 s3, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, s3, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc -; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_store_dword v[0:1], v4, off ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[2:3], v0, off ; GFX9-NEXT: s_endpgm %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -190,23 +202,29 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v6, s4 ; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: flat_load_dword v6, v[6:7] -; VI-NEXT: flat_load_dword v4, v[4:5] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v8, v[6:7] +; VI-NEXT: flat_load_dword v9, v[4:5] ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v6 -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4 -; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v5, v6 +; VI-NEXT: v_add_u32_e32 v4, vcc, v9, v8 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v9 +; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v4, v8 ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: flat_store_dword v[2:3], v5 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_store_dword v[2:3], v4 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -214,23 +232,29 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: global_load_dword v6, v[6:7], off -; GFX9-NEXT: global_load_dword v4, v[4:5], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v8, v[6:7], off +; GFX9-NEXT: global_load_dword v9, v[4:5], off ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v5, v6, v4 -; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], v5, v6 +; GFX9-NEXT: v_add_u32_e32 v4, v8, v9 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v9 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], v4, v8 ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: global_store_dword v[2:3], v5, off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_store_dword v[2:3], v4, off ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm %a = load i32, i32 addrspace(1)* %aptr, align 4 @@ -286,8 +310,12 @@ ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -308,8 +336,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind @@ -354,24 +386,31 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v6, s4 ; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7] -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[8:9], v[6:7] +; VI-NEXT: flat_load_dwordx2 v[10:11], v[4:5] ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v8, vcc, v6, v4 -; VI-NEXT: v_addc_u32_e32 v9, vcc, v7, v5, vcc -; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] -; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[6:7] -; VI-NEXT: flat_store_dwordx2 v[2:3], v[8:9] +; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v10 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v9, v11, vcc +; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] +; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[8:9] +; VI-NEXT: s_nop 0 ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -379,24 +418,31 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[6:7], off +; GFX9-NEXT: global_load_dwordx2 v[10:11], v[4:5], off ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v5, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[6:7] -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[8:9], off +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v11, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[8:9] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm %a = load i64, i64 addrspace(1)* %aptr, align 4 @@ -447,28 +493,34 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v6, s4 ; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7] -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[8:9], v[6:7] +; VI-NEXT: flat_load_dwordx2 v[10:11], v[4:5] ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v9, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v8, vcc, v6, v4 -; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v5 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v7 -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4 -; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, v9, v11 +; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v10 +; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v11 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v9 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10 +; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v8 ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; VI-NEXT: flat_store_dwordx2 v[2:3], v[8:9] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -476,28 +528,34 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[6:7], off +; GFX9-NEXT: global_load_dwordx2 v[10:11], v[4:5], off ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v9, v7, v5 -; GFX9-NEXT: v_add_u32_e32 v8, v6, v4 -; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v7 -; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v9, v11 +; GFX9-NEXT: v_add_u32_e32 v4, v8, v10 +; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v11 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v9 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v8 ; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[8:9], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -37,12 +37,16 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tmp1 = load i32, i32 addrspace(1)* %in, align 4 @@ -86,12 +90,16 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tmp1 = load float, float addrspace(1)* %in, align 4 @@ -123,6 +131,8 @@ ; VI: ; %bb.0: ; %bb ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 @@ -133,6 +143,8 @@ ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm bb: @@ -165,6 +177,8 @@ ; VI: ; %bb.0: ; %bb ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 @@ -175,6 +189,8 @@ ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm bb: @@ -239,11 +255,13 @@ ; VI-LABEL: scalar_to_vector_test6: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -97,8 +97,8 @@ ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; SIVI-NOT: s_mov_b32 s6 -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen +; SIVI-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen +; SIVI-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen ; GFX9_10-NOT: s_mov_b32 s5 ; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen @@ -117,13 +117,14 @@ ; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset: ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen +; SIVI-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen +; SIVI-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen + +; GCN-DAG: s_mov_b32 s2, s5 ; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen ; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GCN-DAG: s_mov_b32 s2, s5 define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -73,9 +73,11 @@ ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_mov_b32 s0, s10 ; TONGA-NEXT: s_mov_b32 s1, s11 -; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; TONGA-NEXT: s_mov_b32 s4, s8 ; TONGA-NEXT: s_mov_b32 s5, s9 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v1 @@ -109,6 +111,8 @@ ; TONGA-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; @@ -122,9 +126,11 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s2 ; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 @@ -158,6 +164,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -247,12 +255,16 @@ ; TONGA-NEXT: s_mov_b32 s5, s7 ; TONGA-NEXT: s_mov_b32 s6, s2 ; TONGA-NEXT: s_mov_b32 s7, s3 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; @@ -268,12 +280,16 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 30, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -341,9 +357,11 @@ ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_mov_b32 s8, s6 ; TONGA-NEXT: s_mov_b32 s9, s7 -; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; TONGA-NEXT: s_mov_b32 s0, 0x98a1930b ; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_mul_hi_i32 v1, v0, s0 ; TONGA-NEXT: s_mov_b32 s0, s4 @@ -351,6 +369,8 @@ ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; @@ -364,9 +384,11 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s6 ; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX9-NEXT: s_mov_b32 s0, 0x98a1930b ; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0 ; GFX9-NEXT: s_mov_b32 s0, s4 @@ -374,6 +396,8 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 11, v0 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -501,6 +525,8 @@ ; TONGA-NEXT: s_mov_b32 s1, s3 ; TONGA-NEXT: s_mov_b32 s2, s10 ; TONGA-NEXT: s_mov_b32 s3, s11 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 @@ -567,6 +593,8 @@ ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; @@ -583,6 +611,8 @@ ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s10 ; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v2 @@ -649,6 +679,8 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v6 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v5 ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; @@ -772,6 +804,8 @@ ; TONGA-NEXT: s_mov_b32 s5, s7 ; TONGA-NEXT: s_mov_b32 s6, s2 ; TONGA-NEXT: s_mov_b32 s7, s3 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0 @@ -782,6 +816,8 @@ ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; @@ -797,6 +833,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0 @@ -807,6 +845,8 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1002,6 +1042,8 @@ ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_mov_b32 s0, s14 ; TONGA-NEXT: s_mov_b32 s1, s15 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; TONGA-NEXT: s_mov_b32 s14, 0x4f800000 @@ -1137,6 +1179,8 @@ ; TONGA-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v6 ; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; @@ -1150,6 +1194,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s10 ; GFX9-NEXT: s_mov_b32 s1, s11 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GFX9-NEXT: s_mov_b32 s4, 0x4f800000 @@ -1285,6 +1331,8 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX9-NEXT: s_endpgm ; @@ -1478,6 +1526,8 @@ ; TONGA-NEXT: s_mov_b32 s5, s7 ; TONGA-NEXT: s_mov_b32 s6, s2 ; TONGA-NEXT: s_mov_b32 s7, s3 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 @@ -1496,6 +1546,8 @@ ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 2, v2 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 2, v3 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; @@ -1511,6 +1563,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 @@ -1529,6 +1583,8 @@ ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 2, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 2, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1620,6 +1676,8 @@ ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_mov_b32 s8, s6 ; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 ; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1 ; TONGA-NEXT: s_mov_b32 s0, s4 @@ -1640,6 +1698,8 @@ ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; @@ -1653,6 +1713,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s6 ; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 ; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1 ; GFX9-NEXT: s_mov_b32 s0, s4 @@ -1673,6 +1735,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1775,6 +1839,8 @@ ; TONGA-NEXT: s_mov_b32 s5, s7 ; TONGA-NEXT: s_mov_b32 s6, s2 ; TONGA-NEXT: s_mov_b32 s7, s3 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; TONGA-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:2 ; TONGA-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 @@ -1801,6 +1867,8 @@ ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; @@ -1816,6 +1884,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:2 ; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 @@ -1842,6 +1912,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1956,6 +2028,8 @@ ; TONGA-NEXT: s_mov_b32 s5, s7 ; TONGA-NEXT: s_mov_b32 s6, s2 ; TONGA-NEXT: s_mov_b32 s7, s3 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; TONGA-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2 ; TONGA-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 @@ -1980,6 +2054,8 @@ ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; @@ -1995,6 +2071,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2 ; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 @@ -2019,6 +2097,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2154,9 +2234,11 @@ ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_mov_b32 s0, s10 ; TONGA-NEXT: s_mov_b32 s1, s11 -; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; TONGA-NEXT: s_mov_b32 s4, s8 ; TONGA-NEXT: s_mov_b32 s5, s9 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_bfe_i32 v2, v1, 0, 25 ; TONGA-NEXT: v_bfe_i32 v1, v1, 24, 1 @@ -2193,6 +2275,8 @@ ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; @@ -2206,9 +2290,11 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s2 ; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfe_i32 v2, v1, 0, 25 ; GFX9-NEXT: v_bfe_i32 v1, v1, 24, 1 @@ -2245,6 +2331,8 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 25 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -2378,12 +2466,14 @@ ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; TONGA-NEXT: s_mov_b32 s0, 0x1389c755 ; TONGA-NEXT: s_mov_b32 s4, s6 ; TONGA-NEXT: s_mov_b32 s5, s7 ; TONGA-NEXT: s_mov_b32 s6, s2 ; TONGA-NEXT: s_mov_b32 s7, s3 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; TONGA-NEXT: s_mov_b32 s0, 0x1389c755 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0 ; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0 @@ -2401,6 +2491,8 @@ ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 +; TONGA-NEXT: s_nop 0 +; TONGA-NEXT: s_nop 0 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; @@ -2412,12 +2504,14 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s0, 0x1389c755 ; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s2 ; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s0, 0x1389c755 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0 ; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0 @@ -2435,6 +2529,8 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -713,7 +713,7 @@ ; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], -4.0, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc -; GCN-NEXT: buffer_store_dword [[SELECT]] +; GCN: buffer_store_dword [[SELECT]] define amdgpu_kernel void @select_fneg_posk_src_add_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -730,7 +730,7 @@ ; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], 4.0, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc -; GCN-NEXT: buffer_store_dword [[SELECT]] +; GCN: buffer_store_dword [[SELECT]] define amdgpu_kernel void @select_fneg_posk_src_sub_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -746,7 +746,7 @@ ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc -; GCN-NEXT: buffer_store_dword [[SELECT]] +; GCN: buffer_store_dword [[SELECT]] define amdgpu_kernel void @select_fneg_posk_src_mul_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -763,7 +763,7 @@ ; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[X]], -4.0, -[[Z]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[FMA]], vcc -; GCN-NEXT: buffer_store_dword [[SELECT]] +; GCN: buffer_store_dword [[SELECT]] define amdgpu_kernel void @select_fneg_posk_src_fma_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -780,7 +780,7 @@ ; GCN: buffer_load_dword [[Z:v[0-9]+]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[X]], vcc -; GCN-NEXT: buffer_store_dword [[SELECT]] +; GCN: buffer_store_dword [[SELECT]] define amdgpu_kernel void @select_fneg_posk_src_fmad_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -799,7 +799,7 @@ ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] -; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +; GCN: buffer_store_dword [[NEG_SELECT]] define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll --- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll @@ -179,8 +179,8 @@ } ; GCN-LABEL: {{^}}v_select_v4i32: -; GCN: buffer_load_dwordx4 -; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 +; GCN-DAG: buffer_load_dwordx4 +; GCN-DAG: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} @@ -265,8 +265,8 @@ } ; GCN-LABEL: {{^}}v_select_v4f32: -; GCN: buffer_load_dwordx4 -; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 +; GCN-DAG: buffer_load_dwordx4 +; GCN-DAG: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -64,6 +64,8 @@ ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; VI-NEXT: buffer_load_ushort v2, off, s[16:19], 0 ; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 @@ -71,6 +73,8 @@ ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, @@ -141,6 +145,8 @@ ; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 @@ -148,6 +154,8 @@ ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, @@ -216,6 +224,8 @@ ; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 @@ -223,6 +233,8 @@ ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, @@ -291,6 +303,8 @@ ; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 @@ -299,6 +313,8 @@ ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, @@ -367,6 +383,8 @@ ; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 @@ -375,6 +393,8 @@ ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, @@ -466,22 +486,26 @@ ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v3, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v3, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm <2 x half> addrspace(1)* %r, @@ -564,6 +588,8 @@ ; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 @@ -579,6 +605,8 @@ ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm <2 x half> addrspace(1)* %r, @@ -659,6 +687,8 @@ ; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 @@ -674,6 +704,8 @@ ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm <2 x half> addrspace(1)* %r, @@ -753,6 +785,8 @@ ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v4, off, s[4:7], 0 @@ -769,6 +803,8 @@ ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm <2 x half> addrspace(1)* %r, @@ -849,6 +885,8 @@ ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v4, off, s[4:7], 0 @@ -865,6 +903,8 @@ ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm <2 x half> addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll --- a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte: ; GCN: s_load_dword [[LD:s[0-9]+]], diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll --- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll @@ -6,7 +6,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_ne_u32_e32 vcc, ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT:buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W @@ -23,7 +23,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_ne_u32_e32 vcc, ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W @@ -40,7 +40,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_eq_u32_e32 vcc, ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b @@ -54,7 +54,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_eq_u32_e32 vcc, ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b @@ -68,7 +68,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_ne_u32_e32 vcc, ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b @@ -82,7 +82,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_ne_u32_e32 vcc, ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b @@ -96,7 +96,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_eq_u32_e32 vcc, ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b @@ -110,7 +110,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_eq_u32_e32 vcc, ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] define amdgpu_kernel void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,12 +8,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -21,21 +23,23 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x30 -; VI-NEXT: s_mov_b32 s3, 0xffff +; VI-NEXT: s_load_dword s3, s[0:1], 0x30 +; VI-NEXT: s_mov_b32 s1, 0xffff ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_lshr_b32 s8, s0, 16 -; VI-NEXT: s_and_b32 s2, s2, s3 -; VI-NEXT: s_and_b32 s0, s0, s3 -; VI-NEXT: s_lshl_b32 s0, s2, s0 -; VI-NEXT: s_lshl_b32 s1, s1, s8 -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_and_b32 s0, s0, s3 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_lshr_b32 s8, s3, 16 +; VI-NEXT: s_and_b32 s2, s2, s1 +; VI-NEXT: s_and_b32 s3, s3, s1 +; VI-NEXT: s_lshl_b32 s0, s0, s8 +; VI-NEXT: s_lshl_b32 s2, s2, s3 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_and_b32 s1, s2, s1 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -73,6 +77,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 @@ -80,6 +86,8 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v2, v4, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -93,15 +101,19 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5 -; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_lshlrev_b16_e32 v2, v6, v5 +; VI-NEXT: v_lshlrev_b16_sdwa v3, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -146,40 +158,48 @@ ; GFX9-LABEL: shl_v_s_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v2, s0, v3 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, s2, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_s_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, s0, v3 +; VI-NEXT: v_lshlrev_b16_e32 v2, s2, v3 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -221,40 +241,48 @@ ; GFX9-LABEL: shl_s_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, s2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_s_v_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v2, v3, s0 +; VI-NEXT: v_lshlrev_b16_e64 v2, v3, s2 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -301,12 +329,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, 8 op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -319,6 +351,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -327,6 +361,8 @@ ; VI-NEXT: v_lshlrev_b16_e64 v2, v4, 8 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -370,12 +406,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -387,6 +427,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -396,6 +438,8 @@ ; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2 ; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -429,45 +473,53 @@ ; GFX9-LABEL: v_shl_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_pk_lshlrev_b16 v3, v5, v3 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, v4, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1 -; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0 -; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v1, v6, v1 -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: v_lshlrev_b16_e32 v2, v7, v5 +; VI-NEXT: v_lshlrev_b16_sdwa v3, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v5, v6, v4 +; VI-NEXT: v_lshlrev_b16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v3, v2, v3 +; VI-NEXT: v_or_b32_e32 v2, v5, v4 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_shl_v4i16: @@ -524,13 +576,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -543,10 +599,12 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 @@ -556,6 +614,8 @@ ; VI-NEXT: v_and_b32_e32 v4, s4, v4 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -345,9 +345,8 @@ ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_private: ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 2, v0 -; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen offset:16 - -; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 3, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen offset:16 +; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 3, v0 ; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s33 offen offset:32 define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 { %idx = zext i16 %idx.arg to i32 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -32,12 +32,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -49,12 +53,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -66,11 +74,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -112,15 +124,21 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_subrev_u32_e32 v1, vcc, 64, v4 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[2:3], v1 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -133,15 +151,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[2:3], v1, off ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm @@ -154,6 +178,8 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 @@ -162,6 +188,8 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v4 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: global_store_dword v[0:1], v3, off ; GFX10-NEXT: s_endpgm @@ -203,12 +231,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_sub_u32_e32 v2, vcc, 64, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -220,12 +252,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v2, 64, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -237,11 +273,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -279,12 +319,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -296,12 +340,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffbf, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -313,11 +361,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffbf, v3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -355,12 +407,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -372,12 +428,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v2, 0x41, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -389,11 +449,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x41, v3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -431,12 +495,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -448,12 +516,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, 16, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -465,11 +537,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v2, 16, v3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -507,12 +583,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_sub_u32_e32 v2, vcc, -16, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -524,12 +604,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v2, -16, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -541,11 +625,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v2, -16, v3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -583,12 +671,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 17, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -600,12 +692,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, 17, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -617,11 +713,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v2, 17, v3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -659,12 +759,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -676,12 +780,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v2, 0xffffffef, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -693,11 +801,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0xffffffef, v3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -781,12 +893,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ushort v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -798,12 +914,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -815,11 +935,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u16_e64 v2, v3, 64 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -860,12 +984,16 @@ ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ushort v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -878,12 +1006,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -897,11 +1029,15 @@ ; GFX10-NEXT: v_add_co_u32_e64 v1, s2, s2, v1 ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ushort v1, v[1:2], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u16_e64 v2, v1, 64 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -944,15 +1080,21 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ushort v4, v[0:1] -; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_ushort v4, v[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_subrev_u16_e32 v1, 64, v4 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[2:3], v1 ; VI-NEXT: flat_store_short v[2:3], v0 ; VI-NEXT: s_endpgm @@ -965,15 +1107,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ushort v4, v[0:1], off -; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_ushort v4, v[0:1], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_short v[2:3], v1, off ; GFX9-NEXT: global_store_short v[2:3], v0, off ; GFX9-NEXT: s_endpgm @@ -986,6 +1134,8 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-NEXT: global_load_ushort v4, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 @@ -994,6 +1144,8 @@ ; GFX10-NEXT: v_sub_nc_u16_e64 v2, v3, 64 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u16_e64 v3, v4, 64 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: global_store_short v[0:1], v3, off ; GFX10-NEXT: s_endpgm @@ -1040,6 +1192,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1048,6 +1202,8 @@ ; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_subrev_u16_e32 v3, 64, v4 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1059,12 +1215,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1076,11 +1236,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1123,6 +1287,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1131,6 +1297,8 @@ ; VI-NEXT: v_add_u16_e32 v2, -7, v4 ; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1143,12 +1311,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1160,11 +1332,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v2, v3, 7 op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1207,6 +1383,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1215,6 +1393,8 @@ ; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_subrev_u16_e32 v3, 64, v4 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1227,12 +1407,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1244,11 +1428,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1290,6 +1478,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1298,6 +1488,8 @@ ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; VI-NEXT: v_add_u16_e32 v3, -7, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1309,12 +1501,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v2, v3, 7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1326,11 +1522,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v2, v3, 7 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1370,6 +1570,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1377,6 +1579,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1388,12 +1592,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1405,11 +1613,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1448,6 +1660,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1455,6 +1669,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1466,12 +1682,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v2, v3, -4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1483,11 +1703,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v2, v3, -4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1526,6 +1750,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1533,6 +1759,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1544,12 +1772,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v2, v3, 4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1561,11 +1793,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v2, v3, 4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1609,6 +1845,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1617,6 +1855,8 @@ ; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_subrev_u16_e32 v3, 32, v4 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1628,12 +1868,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1645,11 +1889,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1688,6 +1936,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1695,6 +1945,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1706,12 +1958,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1723,11 +1979,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1768,6 +2028,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1776,6 +2038,8 @@ ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; VI-NEXT: v_subrev_u16_e32 v3, 32, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1787,12 +2051,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1804,11 +2072,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1852,6 +2124,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1860,6 +2134,8 @@ ; VI-NEXT: v_add_u16_e32 v2, -16, v4 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1871,12 +2147,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1888,11 +2168,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1931,6 +2215,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1938,6 +2224,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1949,12 +2237,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1966,11 +2258,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2011,6 +2307,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2019,6 +2317,8 @@ ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; VI-NEXT: v_add_u16_e32 v3, -16, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2030,12 +2330,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2047,11 +2351,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v2, v3, 16 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2094,6 +2402,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2102,6 +2412,8 @@ ; VI-NEXT: v_add_u16_e32 v2, 0xffffc400, v4 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2113,12 +2425,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 1.0 op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2130,11 +2446,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v2, v3, 1.0 op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2177,6 +2497,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2185,6 +2507,8 @@ ; VI-NEXT: v_add_u16_e32 v2, 4.0, v4 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2196,12 +2520,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v2, v3, -1.0 op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2213,11 +2541,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v2, v3, -1.0 op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2260,6 +2592,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2268,6 +2602,8 @@ ; VI-NEXT: v_add_u16_e32 v2, 2.0, v4 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2279,12 +2615,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v2, v3, -2.0 op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2296,11 +2636,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v2, v3, -2.0 op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2343,6 +2687,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2351,6 +2697,8 @@ ; VI-NEXT: v_add_u16_e32 v2, 0xffffc000, v4 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2362,12 +2710,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 2.0 op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2379,11 +2731,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v2, v3, 2.0 op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2423,12 +2779,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2440,12 +2800,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2457,11 +2821,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2500,12 +2868,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_subrev_u16_e32 v2, 32, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2517,12 +2889,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2534,11 +2910,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global,-xnack -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,MESA %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI,MESA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global,-xnack -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,MESA %s target datalayout = "A5" ; FIXME: Why is this commuted only sometimes? @@ -203,7 +203,7 @@ ; Have another non-tail in the function ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec ; GCN: s_mov_b32 s34, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -19,13 +19,15 @@ ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b @@ -53,15 +55,17 @@ ; VI-LABEL: test_s_sext_i32_to_i64: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: s_add_i32 s0, s0, s2 +; VI-NEXT: s_mul_i32 s0, s8, s9 +; VI-NEXT: s_add_i32 s0, s0, s10 ; VI-NEXT: s_ashr_i32 s1, s0, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm entry: @@ -90,14 +94,16 @@ ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b @@ -123,13 +129,15 @@ ; VI-LABEL: s_sext_i32_to_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s1, s0, 31 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_ashr_i32 s0, s2, 31 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %sext = sext i32 %a to i64 @@ -168,9 +176,13 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %val = load i32, i32 addrspace(1)* %in, align 4 @@ -196,13 +208,15 @@ ; VI-LABEL: s_sext_i16_to_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 +; VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %sext = sext i16 %a to i64 @@ -227,13 +241,15 @@ ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b @@ -266,16 +282,18 @@ ; VI-LABEL: s_sext_i1_to_i16_with_and: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v0 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s10, v1 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %cmp0 = icmp eq i32 %a, %b @@ -304,17 +322,19 @@ ; ; VI-LABEL: v_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v1 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %cmp0 = icmp eq i32 %a, %tid @@ -361,21 +381,27 @@ ; VI-LABEL: s_sext_v4i8_to_v4i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s0 -; VI-NEXT: s_ashr_i32 s1, s0, 24 -; VI-NEXT: s_bfe_i32 s2, s0, 0x80010 -; VI-NEXT: s_sext_i32_i8 s0, s0 +; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; VI-NEXT: s_ashr_i32 s0, s2, 24 +; VI-NEXT: s_bfe_i32 s1, s2, 0x80010 +; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 8 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %cast = bitcast i32 %a to <4 x i8> @@ -430,6 +456,8 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0 @@ -437,6 +465,8 @@ ; VI-NEXT: v_bfe_i32 v3, v0, 16, 8 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 8 ; VI-NEXT: v_bfe_i32 v1, v1, 0, 8 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 @@ -495,13 +525,21 @@ ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_ashr_i32 s4, s7, 16 +; VI-NEXT: s_sext_i32_i16 s7, s7 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_sext_i32_i16 s7, s7 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %cast = bitcast i64 %a to <4 x i16> @@ -554,12 +592,16 @@ ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 ; VI-NEXT: v_bfe_i32 v1, v1, 0, 16 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll --- a/llvm/test/CodeGen/AMDGPU/smed3.ll +++ b/llvm/test/CodeGen/AMDGPU/smed3.ll @@ -637,10 +637,10 @@ ; SI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; FIXME: VI not matching med3 -; VI: v_min_i16 -; VI: v_max_i16 -; VI: v_min_i16 -; VI: v_max_i16 +; VI-DAG: v_min_i16 +; VI-DAG: v_max_i16 +; VI-DAG: v_min_i16 +; VI-DAG: v_max_i16 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -120,10 +120,10 @@ ; SMRD load using the load.const.v4i32 intrinsic with the largest possible immediate ; offset. ; GCN-LABEL: {{^}}smrd_load_const1: -; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff -; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff glc ; encoding: [0xff -; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc ; -; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc glc ; +; SICI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0xff ; encoding: [0xff +; SICI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0xff glc ; encoding: [0xff +; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x3fc ; +; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x3fc glc ; define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -141,12 +141,12 @@ ; immediate offset. ; GCN-LABEL: {{^}}smrd_load_const2: ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] -; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 -; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +; SI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; SI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x100 +; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x100 +; VIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x400 +; VIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x400 define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -162,12 +162,12 @@ ; SMRD load with the largest possible immediate offset on VI ; GCN-LABEL: {{^}}smrd_load_const3: ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] -; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff -; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +; SI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] +; SI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] +; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x3ffff +; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x3ffff +; VIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0xffffc +; VIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0xffffc define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -183,10 +183,10 @@ ; SMRD load with an offset greater than the largest possible immediate on VI ; GCN-LABEL: {{^}}smrd_load_const4: ; SIVIGFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 -; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] -; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] -; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 -; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 +; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] +; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] +; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x40000 +; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x40000 ; GCN: s_endpgm define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s -; RUN: not llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: not llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GCN,GFX900 %s ; GCN-LABEL: {{^}}max_10_vgprs: ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 diff --git a/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll b/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll --- a/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll +++ b/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll @@ -1,8 +1,4 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx902 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s -; RUN: llc -march=amdgcn -mcpu=gfx904 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc < %s | FileCheck -check-prefixes=GCN,ECC %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-sram-ecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -30,12 +30,16 @@ ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_cbranch_execz BB0_2 ; GCN-NEXT: ; %bb.1: ; %if.then4.i +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v0, v32, s[36:39], s32 offen ; GCN-NEXT: buffer_load_dword v1, v32, s[36:39], s32 offen offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 ; GCN-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_dword v0, v0, s[36:39], s33 offen ; GCN-NEXT: BB0_2: ; %shader_eval_surface.exit ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -10,6 +10,8 @@ ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:128 ; VI-NEXT: s_endpgm ; VI-NEXT: .section .rodata,#alloc @@ -54,6 +56,8 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:128 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc @@ -105,6 +109,8 @@ ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4 ; VI-NEXT: s_endpgm ; VI-NEXT: .section .rodata,#alloc @@ -149,6 +155,8 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc @@ -200,6 +208,8 @@ ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4 ; VI-NEXT: s_endpgm ; VI-NEXT: .section .rodata,#alloc @@ -244,6 +254,8 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -37,13 +37,14 @@ ; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffffc00 ; GCN: s_add_u32 s32, s32, 0x2800{{$}} +; GCN: s_sub_u32 s32, s32, 0x2800 + ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: s_sub_u32 s32, s32, 0x2800 ; GCN: ; ScratchSize: 160 define void @needs_align16_stack_align4(i32 %idx) #2 { @@ -58,14 +59,14 @@ ; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffff800 ; GCN: s_add_u32 s32, s32, 0x3000{{$}} +; GCN: s_sub_u32 s32, s32, 0x3000 + ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: s_sub_u32 s32, s32, 0x3000 - ; GCN: ; ScratchSize: 192 define void @needs_align32(i32 %idx) #0 { %alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5) @@ -79,8 +80,8 @@ ; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xffffff00 ; GCN: s_add_u32 s32, s32, 0xd00{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: s_sub_u32 s32, s32, 0xd00 +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: ; ScratchSize: 52 define void @force_realign4(i32 %idx) #1 { @@ -133,8 +134,8 @@ ; GCN-NEXT: s_and_b32 s34, [[TMP]], 0xffffe000 ; GCN-NEXT: s_add_u32 s32, s32, 0x4000 ; GCN-NOT: s34 -; GCN: buffer_store_dword v0, off, s[0:3], s34{{$}} ; GCN: s_sub_u32 s32, s32, 0x4000 +; GCN: buffer_store_dword v0, off, s[0:3], s34{{$}} ; GCN: s_mov_b32 s34, [[FP_COPY]] define void @default_realign_align128(i32 %idx) #0 { %alloca.align = alloca i32, align 128, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -8,8 +8,8 @@ ; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX803-NEXT: flat_store_short v[0:1], v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off +; GFX803: flat_store_short v[0:1], v2 +; GFX906: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -28,8 +28,8 @@ ; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX803-NEXT: flat_store_short v[0:1], v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off +; GFX803: flat_store_short v[0:1], v2 +; GFX906: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -48,8 +48,8 @@ ; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX803-NEXT: flat_store_short v[0:1], v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off +; GFX803: flat_store_short v[0:1], v2 +; GFX906: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -67,8 +67,8 @@ ; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX803-NEXT: flat_store_byte v[0:1], v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off +; GFX803: flat_store_byte v[0:1], v2 +; GFX906: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -87,8 +87,8 @@ ; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX803-NEXT: flat_store_byte v[0:1], v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off +; GFX803: flat_store_byte v[0:1], v2 +; GFX906: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -110,7 +110,7 @@ ; GFX803: flat_store_short v[0:1], v2{{$}} ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off +; GFX906: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -134,7 +134,7 @@ ; GFX803: flat_store_short v[0:1], v{{[0-9]$}} ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off +; GFX906: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -157,7 +157,7 @@ ; GFX803: flat_store_byte v[0:1], v{{[0-9]$}} ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off +; GFX906: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -181,7 +181,7 @@ ; GFX803: flat_store_byte v[0:1], v{{[0-9]$}} ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off +; GFX906: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -201,7 +201,7 @@ ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; NO-D16-HI-NEXT: flat_store_short v[0:1], v2 +; NO-D16-HI: flat_store_short v[0:1], v2 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -219,7 +219,7 @@ ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; NO-D16-HI-NEXT: flat_store_short v[0:1], v2 +; NO-D16-HI: flat_store_short v[0:1], v2 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -237,7 +237,7 @@ ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; NO-D16-HI-NEXT: flat_store_short v[0:1], v2 +; NO-D16-HI: flat_store_short v[0:1], v2 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -255,7 +255,7 @@ ; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2 +; NO-D16-HI: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -274,7 +274,7 @@ ; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2 +; NO-D16-HI: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -291,7 +291,7 @@ ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}} ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: flat_store_short v[0:1], v2 offset:4094 +; GFX906: flat_store_short v[0:1], v2 offset:4094 ; GFX803-DAG: v_add_u32_e32 ; GFX803-DAG: v_addc_u32_e32 @@ -320,7 +320,7 @@ ; GFX906-DAG: v_lshrrev_b32_e32 ; GFX906: flat_store_short v[0:1], v2 offset:2050{{$}} -; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:2050{{$}} +; GFX900: flat_store_short_d16_hi v[0:1], v2 offset:2050{{$}} ; GFX803: flat_store_short v[0:1], v2{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -343,7 +343,7 @@ ; GFX803: flat_store_byte v[0:1], v2{{$}} ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: flat_store_byte v[0:1], v2 offset:4095{{$}} +; GFX906: flat_store_byte v[0:1], v2 offset:4095{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -366,7 +366,7 @@ ; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v ; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v{{[0-9]+}}, vcc -; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:1{{$}} +; GFX900: flat_store_byte_d16_hi v[0:1], v2 offset:1{{$}} ; GFX906-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX906: flat_store_byte v[0:1], v2 offset:1{{$}} @@ -430,7 +430,7 @@ ; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s33 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -448,7 +448,7 @@ ; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s33 offen{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI: buffer_store_byte v1, v0, s[0:3], s33 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -467,7 +467,7 @@ ; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s33 offen{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI: buffer_store_byte v1, v0, s[0:3], s33 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -484,7 +484,7 @@ ; GFX900: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 -; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}} +; NO-D16-HI: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -505,7 +505,7 @@ ; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s33{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s33{{$}} +; NO-D16-HI: buffer_store_short v0, off, s[0:3], s33{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -64,6 +64,8 @@ ; FIJI-NEXT: s_addc_u32 s1, s5, 0 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: s_nop 0 +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] ; FIJI-NEXT: v_mov_b32_e32 v1, s2 ; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -80,10 +82,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 diff --git a/llvm/test/CodeGen/AMDGPU/sub.i16.ll b/llvm/test/CodeGen/AMDGPU/sub.i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.i16.ll @@ -6,7 +6,7 @@ ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; VI-NEXT: buffer_store_short [[ADD]] +; VI: buffer_store_short [[ADD]] define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid @@ -23,7 +23,7 @@ ; GCN-LABEL: {{^}}v_test_sub_i16_constant: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffff85, [[A]] -; VI-NEXT: buffer_store_short [[ADD]] +; VI: buffer_store_short [[ADD]] define amdgpu_kernel void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid @@ -38,7 +38,7 @@ ; GCN-LABEL: {{^}}v_test_sub_i16_neg_constant: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x34d, [[A]] -; VI-NEXT: buffer_store_short [[ADD]] +; VI: buffer_store_short [[ADD]] define amdgpu_kernel void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid @@ -53,7 +53,7 @@ ; GCN-LABEL: {{^}}v_test_sub_i16_inline_63: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], 63, [[A]] -; VI-NEXT: buffer_store_short [[ADD]] +; VI: buffer_store_short [[ADD]] define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid @@ -69,7 +69,7 @@ ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; VI-NEXT: buffer_store_dword [[ADD]] +; VI: buffer_store_dword [[ADD]] define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid @@ -109,7 +109,7 @@ ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 -; VI-NEXT: buffer_store_dword [[SEXT]] +; VI: buffer_store_dword [[SEXT]] define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid @@ -128,9 +128,9 @@ ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 +; VI: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; VI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -7,7 +7,7 @@ ; GFX9-LABEL: v_test_sub_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -15,22 +15,26 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v0, v4, v5 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -38,17 +42,21 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_e32 v2, v0, v1 -; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_sub_u16_e32 v0, v4, v5 +; VI-NEXT: v_sub_u16_sdwa v1, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -77,6 +85,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_pk_sub_i16 v0, s4, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -100,6 +110,8 @@ ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 @@ -116,7 +128,9 @@ ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 @@ -131,12 +145,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -144,18 +160,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x30 +; VI-NEXT: s_load_dword s3, s[0:1], 0x30 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_lshr_b32 s3, s0, 16 -; VI-NEXT: s_sub_i32 s1, s1, s3 -; VI-NEXT: s_sub_i32 s0, s2, s0 -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: s_sub_i32 s0, s0, s1 +; VI-NEXT: s_sub_i32 s1, s2, s3 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %add = sub <2 x i16> %a, %b @@ -175,11 +193,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -194,13 +216,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v1, 0xffffff85, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -225,11 +251,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -244,13 +274,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v1, 0x34d, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -273,11 +307,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -292,13 +330,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v1, 1, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -321,11 +363,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -339,13 +385,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -370,11 +420,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -389,12 +443,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_sdwa v1, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -411,7 +469,7 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -419,24 +477,28 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v0, v4, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -444,16 +506,20 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_e32 v0, v1, v2 -; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_u16_e32 v0, v4, v5 +; VI-NEXT: v_sub_u16_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -473,7 +539,7 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -481,26 +547,30 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v1, v4, v5 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -508,18 +578,22 @@ ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v4, v[4:5] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v6, v[2:3] +; VI-NEXT: flat_load_dword v7, v[4:5] ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_e32 v0, v2, v4 -; VI-NEXT: v_sub_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_u16_e32 v0, v6, v7 +; VI-NEXT: v_sub_u16_sdwa v2, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -539,7 +613,7 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -547,24 +621,28 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v0, v4, v5 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -572,18 +650,22 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_sub_u16_e32 v0, v0, v1 +; VI-NEXT: v_sub_u16_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_u16_e32 v0, v4, v5 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NEXT: v_bfe_i32 v1, v2, 0, 16 +; VI-NEXT: v_bfe_i32 v1, v1, 0, 16 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -603,7 +685,7 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -611,27 +693,31 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v1, v4, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -639,20 +725,24 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_sub_u16_e32 v0, v0, v1 +; VI-NEXT: v_sub_u16_e32 v0, v4, v5 +; VI-NEXT: v_sub_u16_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_bfe_i32 v2, v1, 0, 16 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -104,18 +104,21 @@ ; VI-LABEL: truncate_high_elt_extract_vector: ; VI: ; %bb.0: ; %bb ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: s_load_dword s1, s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s0, s0 ; VI-NEXT: s_sext_i32_i16 s1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mul_i32_i24_e32 v2, s1, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll --- a/llvm/test/CodeGen/AMDGPU/umed3.ll +++ b/llvm/test/CodeGen/AMDGPU/umed3.ll @@ -672,10 +672,10 @@ ; SI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; FIXME: VI not matching med3 -; VI: v_min_u16 -; VI: v_max_u16 -; VI: v_min_u16 -; VI: v_max_u16 +; VI-DAG: v_min_u16 +; VI-DAG: v_max_u16 +; VI-DAG: v_min_u16 +; VI-DAG: v_max_u16 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -44,10 +44,14 @@ ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_madak_f16 v0, v0, v1, 0x4900 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, @@ -117,6 +121,8 @@ ; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 @@ -129,6 +135,8 @@ ; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mac_f16_e32 v2, v0, v3 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 ; VI-NEXT: buffer_store_short v2, off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -18,11 +18,12 @@ ; GFX9-LABEL: shuffle_v4f16_234u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -74,11 +75,12 @@ ; GFX9-LABEL: shuffle_v4f16_3u6u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -90,11 +92,12 @@ ; GFX9-LABEL: shuffle_v4f16_3uu7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -106,15 +109,15 @@ ; GFX9-LABEL: shuffle_v4f16_35u5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -126,14 +129,14 @@ ; GFX9-LABEL: shuffle_v4f16_357u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -173,9 +176,12 @@ ; GFX9-LABEL: shuffle_v4f16_0145: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -187,9 +193,12 @@ ; GFX9-LABEL: shuffle_v4f16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -229,9 +238,12 @@ ; GFX9-LABEL: shuffle_v4f16_2345: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -243,9 +255,12 @@ ; GFX9-LABEL: shuffle_v4f16_2367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -257,11 +272,12 @@ ; GFX9-LABEL: shuffle_v4f16_4501: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -273,11 +289,12 @@ ; GFX9-LABEL: shuffle_v4f16_4523: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -316,11 +333,12 @@ ; GFX9-LABEL: shuffle_v4f16_6701: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -332,11 +350,12 @@ ; GFX9-LABEL: shuffle_v4f16_6723: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -376,13 +395,14 @@ ; GFX9-LABEL: shuffle_v4f16_2356: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -394,13 +414,14 @@ ; GFX9-LABEL: shuffle_v4f16_5623: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -412,15 +433,15 @@ ; GFX9-LABEL: shuffle_v4f16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -432,15 +453,15 @@ ; GFX9-LABEL: shuffle_v4f16_5634: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -452,16 +473,16 @@ ; GFX9-LABEL: shuffle_v4f16_5734: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -473,13 +494,14 @@ ; GFX9-LABEL: shuffle_v4i16_2356: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -491,9 +513,12 @@ ; GFX9-LABEL: shuffle_v4i16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -557,12 +582,12 @@ ; GFX9-LABEL: shuffle_v4f16_6161: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -635,9 +660,12 @@ ; GFX9-LABEL: shuffle_v8f16_4589: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:8 -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -649,11 +677,12 @@ ; GFX9-LABEL: shuffle_v8f16_10_11_2_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -665,13 +694,14 @@ ; GFX9-LABEL: shuffle_v8f16_13_14_2_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -714,14 +744,18 @@ ; GFX9-LABEL: shuffle_v6f16_452367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: global_load_dword v3, v[3:4], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off +; GFX9-NEXT: global_load_dword v7, v[3:4], off ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0 %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1 @@ -733,7 +767,7 @@ ; GFX9-LABEL: fma_shuffle: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -742,17 +776,25 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v6, v0, v2, v6 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v7 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v6 op_sel:[1,0,0] ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_endpgm entry: @@ -788,15 +830,15 @@ ; GFX9-LABEL: shuffle_v4f16_0456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v7, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -809,14 +851,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v5, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -10,23 +10,26 @@ ; GCN-NEXT: BB0_1: ; %bb0 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 -; GCN-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 +; GCN-NEXT: v_add_co_u32_e64 v6, vcc_lo, v0, 8 ; GCN-NEXT: s_mov_b32 s5, exec_lo -; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GCN-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] +; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s8, v4 -; GCN-NEXT: v_readfirstlane_b32 s9, v5 -; GCN-NEXT: v_readfirstlane_b32 s10, v2 -; GCN-NEXT: v_readfirstlane_b32 s11, v3 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5] -; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GCN-NEXT: v_readfirstlane_b32 s8, v2 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3] +; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 ; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_dword v0, v0, s[8:11], 0 offen ; GCN-NEXT: v_nop ; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s4 diff --git a/llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir b/llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir --- a/llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: vmem_vcc_fallthrough # GCN: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -155,7 +155,7 @@ ; GCN: flat_load_dword ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX10: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_barrier +; GCN: s_barrier define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -182,7 +182,7 @@ ; GFX9_10: global_load_dword ; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX9_10: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: {{global|flat}}_store_dword +; GCN: {{global|flat}}_store_dword define amdgpu_kernel void @load_vmcnt_global(i32 addrspace(1)* %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -201,7 +201,7 @@ ; GCN: flat_load_dword ; GCN-NOT: vscnt ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: {{global|flat}}_store_dword +; GCN: {{global|flat}}_store_dword define amdgpu_kernel void @load_vmcnt_flat(i32* %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1058,7 +1058,7 @@ ; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: v_nop ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]] @@ -1081,7 +1081,7 @@ ; GCN: v_readlane_b32 s34, v32, 2 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: v_nop ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]] diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -30,6 +30,8 @@ ; VI-NEXT: s_addk_i32 s0, 0x3e7 ; VI-NEXT: s_or_b32 s0, s0, 4 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %load = load i16, i16 addrspace(4)* %arg, align 4 @@ -69,6 +71,8 @@ ; VI-NEXT: s_addk_i32 s0, 0x3e7 ; VI-NEXT: s_or_b32 s0, s0, 4 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %load = load i16, i16 addrspace(4)* %arg, align 4 @@ -109,6 +113,8 @@ ; VI-NEXT: s_addk_i32 s0, 0x3e7 ; VI-NEXT: s_or_b32 s0, s0, 4 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %load = load i16, i16 addrspace(4)* %arg, align 4 @@ -156,6 +162,8 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_bfe_u32 s0, s0, 0x10010 ; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: flat_store_byte v[2:3], v5 ; VI-NEXT: s_endpgm @@ -192,6 +200,8 @@ ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f16_e64 v2, s0, 4.0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %load = load half, half addrspace(4)* %arg, align 4 @@ -241,6 +251,8 @@ ; VI-NEXT: v_or_b32_e32 v2, s0, v0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %load = load <2 x i8>, <2 x i8> addrspace(4)* %arg, align 4 @@ -279,10 +291,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, 0x3e7, v2 ; VI-NEXT: v_or_b32_e32 v2, 4, v2 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -321,6 +337,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s0, s0, 1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm %load = load i1, i1 addrspace(4)* %arg, align 4 @@ -359,6 +377,8 @@ ; VI-NEXT: s_addk_i32 s0, 0x3e7 ; VI-NEXT: s_or_b32 s0, s0, 4 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %load = load i16, i16 addrspace(4)* %arg, align 4 @@ -401,6 +421,8 @@ ; VI-NEXT: s_addc_u32 s1, 0, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm %load = load i1, i1 addrspace(4)* %arg, align 4 @@ -440,6 +462,8 @@ ; VI-NEXT: s_addk_i32 s0, 0x3e7 ; VI-NEXT: s_or_b32 s0, s0, 4 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %load = load i16, i16 addrspace(6)* %arg, align 4 @@ -477,6 +501,8 @@ ; VI-NEXT: s_addk_i32 s0, 0x3e7 ; VI-NEXT: s_or_b32 s0, s0, 1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %load = load i16, i16 addrspace(1)* %arg, align 4, !invariant.load !0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -1,5 +1,5 @@ -; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-O0 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-O3 %s +; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -mattr=-xnack -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-O0 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-O3 %s define amdgpu_cs void @no_cfg(<4 x i32> inreg %tmp14) { %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -204,7 +204,7 @@ ; FUNC-LABEL: {{^}}scalar_xor_inline_imm_i64: ; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-NOT: xor_b32 -; SI: s_xor_b32 s[[VAL_LO]], s{{[0-9]+}}, 63 +; SI: s_xor_b32 s{{[0-9]+}}, s[[VAL_LO]], 63 ; SI-NOT: xor_b32 ; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s{{[0-9]+}} ; SI-NOT: xor_b32 diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -54,7 +54,7 @@ ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], [[MASK]] ; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], [[MASK]] -; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] +; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[MASK_B]] ; GCN: v_cmp_eq_u32_e32 vcc, [[MASK_A]], [[V_B]] ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc diff --git a/llvm/test/MC/AMDGPU/hsa-sgpr-init-bug-v3.s b/llvm/test/MC/AMDGPU/hsa-sgpr-init-bug-v3.s --- a/llvm/test/MC/AMDGPU/hsa-sgpr-init-bug-v3.s +++ b/llvm/test/MC/AMDGPU/hsa-sgpr-init-bug-v3.s @@ -15,7 +15,7 @@ .text -.amdgcn_target "amdgcn-amd-amdhsa--gfx802" +.amdgcn_target "amdgcn-amd-amdhsa--gfx802+xnack" .p2align 8 minimal: diff --git a/llvm/test/MC/AMDGPU/isa-version-hsa.s b/llvm/test/MC/AMDGPU/isa-version-hsa.s --- a/llvm/test/MC/AMDGPU/isa-version-hsa.s +++ b/llvm/test/MC/AMDGPU/isa-version-hsa.s @@ -6,8 +6,8 @@ // RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s // RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s -// OSABI-HSA: .amd_amdgpu_isa "amdgcn-amd-amdhsa--gfx802" +// OSABI-HSA: .amd_amdgpu_isa "amdgcn-amd-amdhsa--gfx802+xnack" // OSABI-UNK-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line // OSABI-HSA-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line // OSABI-PAL-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line -.amd_amdgpu_isa "amdgcn-amd-amdhsa--gfx802" +.amd_amdgpu_isa "amdgcn-amd-amdhsa--gfx802+xnack" diff --git a/llvm/test/MC/AMDGPU/isa-version-pal.s b/llvm/test/MC/AMDGPU/isa-version-pal.s --- a/llvm/test/MC/AMDGPU/isa-version-pal.s +++ b/llvm/test/MC/AMDGPU/isa-version-pal.s @@ -6,8 +6,8 @@ // RUN: llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s // RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s -// OSABI-PAL: .amd_amdgpu_isa "amdgcn-amd-amdpal--gfx802" +// OSABI-PAL: .amd_amdgpu_isa "amdgcn-amd-amdpal--gfx802+xnack" // OSABI-UNK-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line // OSABI-HSA-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line // OSABI-PAL-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line -.amd_amdgpu_isa "amdgcn-amd-amdpal--gfx802" +.amd_amdgpu_isa "amdgcn-amd-amdpal--gfx802+xnack" diff --git a/llvm/test/MC/AMDGPU/isa-version-unk.s b/llvm/test/MC/AMDGPU/isa-version-unk.s --- a/llvm/test/MC/AMDGPU/isa-version-unk.s +++ b/llvm/test/MC/AMDGPU/isa-version-unk.s @@ -6,8 +6,8 @@ // RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s // RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s -// OSABI-UNK: .amd_amdgpu_isa "amdgcn-amd-unknown--gfx802" +// OSABI-UNK: .amd_amdgpu_isa "amdgcn-amd-unknown--gfx802+xnack" // OSABI-UNK-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line // OSABI-HSA-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line // OSABI-PAL-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line -.amd_amdgpu_isa "amdgcn-amd-unknown--gfx802" +.amd_amdgpu_isa "amdgcn-amd-unknown--gfx802+xnack" diff --git a/llvm/test/MC/AMDGPU/reg-syntax-err.s b/llvm/test/MC/AMDGPU/reg-syntax-err.s --- a/llvm/test/MC/AMDGPU/reg-syntax-err.s +++ b/llvm/test/MC/AMDGPU/reg-syntax-err.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefix=NOVI %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -mattr=-xnack %s 2>&1 | FileCheck -check-prefix=NOVI %s s_mov_b32 s1, s 1 // NOVI: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/xnack-mask.s b/llvm/test/MC/AMDGPU/xnack-mask.s --- a/llvm/test/MC/AMDGPU/xnack-mask.s +++ b/llvm/test/MC/AMDGPU/xnack-mask.s @@ -1,7 +1,7 @@ // RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti -show-encoding %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 %s // RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii -show-encoding %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1001 -show-encoding %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -mattr=-xnack -show-encoding %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1001 -mattr=-xnack -show-encoding %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 %s // RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -show-encoding %s 2>&1 | FileCheck -check-prefix=XNACKERR %s // RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -show-encoding %s | FileCheck -check-prefix=XNACK %s