diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -766,7 +766,6 @@ FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, - FeatureXNACK, FeatureUnpackedD16VMem, FeatureCodeObjectV3]>; @@ -786,7 +785,6 @@ def FeatureISAVersion8_1_0 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount16, - FeatureXNACK, FeatureCodeObjectV3]>; def FeatureISAVersion9_0_0 : FeatureSet< @@ -800,7 +798,6 @@ [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureXNACK, FeatureDoesNotSupportSRAMECC, FeatureCodeObjectV3]>; @@ -836,8 +833,6 @@ FeatureMAIInsts, FeaturePkFmacF16Inst, FeatureAtomicFaddInsts, - FeatureXNACK, - FeatureSRAMECC, FeatureMFMAInlineLiteralBug, FeatureCodeObjectV3]>; @@ -845,7 +840,6 @@ [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureXNACK, FeatureCodeObjectV3]>; // TODO: Organize more features into groups. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -722,9 +722,9 @@ } bool isXNACKEnabled() const { - // FIXME: XNACK should be enabled with "Any" as well as "On". We - // can then remove this function and start using getXnackSetting directly. - return getXnackSetting() == AMDGPU::IsaInfo::TargetIDSetting::On; + auto TIDS = getXnackSetting(); + return TIDS == AMDGPU::IsaInfo::TargetIDSetting::Any || + TIDS == AMDGPU::IsaInfo::TargetIDSetting::On; } bool supportAnyXnackSetting() const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -915,18 +915,9 @@ errs() << "warning: SramEcc Off was requested for a processor that does " "not support it!\n"; - // FIXME: These hacks are necessary to support backwards compatibility with - // the old defaults for xnack. When the new targetid feature is enabled this, - // along with the change in isXNACKEnabled can be updated to reflect the true - // intended meaning of "default" for these settings. - if (EnableXNACK) - SupportAnyXnackSetting = false; if (GPU == "generic" || GPU == "generic-hsa") { SupportAnySramEccSetting = true; - - // FIXME - SupportAnyXnackSetting = false; - EnableXNACK = true; + SupportAnyXnackSetting = true; } LLVM_DEBUG(dbgs() << "XNACK setting for subtarget: " << getXnackSetting() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -521,11 +521,11 @@ ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] @@ -611,12 +611,12 @@ ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 @@ -625,20 +625,20 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: flat_load_ubyte v1, v[2:3] -; VI-NEXT: flat_load_ubyte v2, v[4:5] -; VI-NEXT: flat_load_ubyte v3, v[6:7] +; VI-NEXT: flat_load_ubyte v11, v[0:1] +; VI-NEXT: flat_load_ubyte v10, v[2:3] +; VI-NEXT: flat_load_ubyte v9, v[4:5] +; VI-NEXT: flat_load_ubyte v8, v[6:7] ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -706,12 +706,12 @@ ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] @@ -754,12 +754,12 @@ ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] @@ -802,11 +802,11 @@ ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] @@ -860,12 +860,12 @@ ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 @@ -874,20 +874,20 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: flat_load_ubyte v1, v[2:3] -; VI-NEXT: flat_load_ubyte v2, v[4:5] -; VI-NEXT: flat_load_ubyte v3, v[6:7] +; VI-NEXT: flat_load_ubyte v11, v[0:1] +; VI-NEXT: flat_load_ubyte v10, v[2:3] +; VI-NEXT: flat_load_ubyte v9, v[4:5] +; VI-NEXT: flat_load_ubyte v8, v[6:7] ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -921,12 +921,12 @@ ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] @@ -968,12 +968,12 @@ ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] @@ -1017,12 +1017,12 @@ ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] @@ -1065,12 +1065,12 @@ ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -131,6 +131,7 @@ ; CHECK-NEXT: s_add_u32 s4, s4, external_constant@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, external_constant@gotpcrel32@hi+4 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -146,6 +147,7 @@ ; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, const.ptr@gotpcrel32@hi+4 ; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -8,280 +8,418 @@ ; GCN-LABEL: v_extract_v64i32_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v0 ; GCN-NEXT: s_add_u32 s4, s32, 0x3fc0 ; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_add_co_u32_e32 v52, vcc, 64, v0 ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_and_b32 s33, s4, 0xffffc000 ; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15 -; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc -; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11 -; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v16, v12, vcc +; GCN-NEXT: v_mov_b32_e32 v13, s5 +; GCN-NEXT: v_mov_b32_e32 v12, s4 +; GCN-NEXT: v_addc_co_u32_e32 v53, vcc, 0, v1, vcc +; GCN-NEXT: v_add_co_u32_e32 v54, vcc, v0, v12 +; GCN-NEXT: v_addc_co_u32_e32 v55, vcc, v1, v13, vcc ; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off -; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 -; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc -; GCN-NEXT: global_load_dwordx4 v[11:14], v[15:16], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[15:18], v[15:16], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[19:22], v[31:32], off -; GCN-NEXT: global_load_dwordx4 v[23:26], v[31:32], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[27:30], v[31:32], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[31:34], v[31:32], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[35:38], v[48:49], off -; GCN-NEXT: global_load_dwordx4 v[39:42], v[48:49], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[43:46], v[48:49], off offset:32 -; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 -; GCN-NEXT: v_add_u32_e32 v1, 16, v0 -; GCN-NEXT: v_add_u32_e32 v2, 20, v0 -; GCN-NEXT: s_add_u32 s32, s32, 0x10000 -; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 +; GCN-NEXT: v_mov_b32_e32 v13, s5 +; GCN-NEXT: v_mov_b32_e32 v12, s4 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_add_co_u32_e32 v56, vcc, v0, v12 +; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GCN-NEXT: v_addc_co_u32_e32 v57, vcc, v1, v13, vcc +; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[40:43], v[52:53], off +; GCN-NEXT: global_load_dwordx4 v[36:39], v[52:53], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[32:35], v[52:53], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[16:19], v[52:53], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[44:47], v[54:55], off +; GCN-NEXT: global_load_dwordx4 v[48:51], v[54:55], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[54:55], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[54:55], off offset:48 +; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 +; GCN-NEXT: v_add_u32_e32 v3, 0x100, v3 +; GCN-NEXT: v_add_u32_e32 v60, 16, v3 +; GCN-NEXT: v_add_u32_e32 v61, 20, v3 +; GCN-NEXT: v_add_u32_e32 v0, 24, v3 +; GCN-NEXT: v_add_u32_e32 v1, 28, v3 +; GCN-NEXT: s_add_u32 s32, s32, 0x14000 +; GCN-NEXT: s_sub_u32 s32, s32, 0x14000 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:720 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:764 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(16) +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[28:31], v[56:57], off +; GCN-NEXT: s_nop 0 +; GCN-NEXT: global_load_dwordx4 v[44:47], v[56:57], off offset:16 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(16) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[52:55], v[56:57], off offset:32 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: global_load_dwordx4 v[56:59], v[56:57], off offset:48 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_store_dword v4, v60, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v61, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 24, v0 -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 28, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 32, v0 -; GCN-NEXT: v_add_u32_e32 v2, 36, v0 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 40, v0 -; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 44, v0 +; GCN-NEXT: v_add_u32_e32 v0, 32, v3 +; GCN-NEXT: v_add_u32_e32 v1, 36, v3 +; GCN-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 40, v3 +; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 44, v3 +; GCN-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 48, v3 +; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 52, v3 +; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 56, v3 ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 48, v0 -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 52, v0 +; GCN-NEXT: v_add_u32_e32 v1, 60, v3 +; GCN-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 56, v0 -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 60, v0 +; GCN-NEXT: v_add_u32_e32 v0, 64, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x44, v3 +; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x48, v3 +; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v3 +; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x50, v3 +; GCN-NEXT: buffer_store_dword v43, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x54, v3 +; GCN-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x58, v3 +; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v3 +; GCN-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x60, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x64, v3 +; GCN-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x68, v3 +; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v3 +; GCN-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x70, v3 +; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x74, v3 +; GCN-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x78, v3 ; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 64, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x44, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v3 +; GCN-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 -; GCN-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x4c, v0 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s33 offset:752 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s33 offset:756 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s33 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0x80, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x84, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v4, v24 +; GCN-NEXT: v_mov_b32_e32 v5, v25 +; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v6, v26 +; GCN-NEXT: v_add_u32_e32 v0, 0x88, v3 +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x90, v3 +; GCN-NEXT: v_mov_b32_e32 v7, v27 +; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v3 +; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x94, v3 +; GCN-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x98, v3 +; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v3 +; GCN-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0xa0, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v12, v32 +; GCN-NEXT: v_mov_b32_e32 v13, v33 +; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v14, v34 +; GCN-NEXT: v_add_u32_e32 v0, 0xa8, v3 +; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xb0, v3 +; GCN-NEXT: v_mov_b32_e32 v15, v35 +; GCN-NEXT: v_add_u32_e32 v1, 0xac, v3 +; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v3 +; GCN-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xb8, v3 ; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 -; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x54, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v3 +; GCN-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 -; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x64, v0 -; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 -; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x6c, v0 -; GCN-NEXT: buffer_store_dword v29, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 -; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x74, v0 -; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 -; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x84, v0 -; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 -; GCN-NEXT: buffer_store_dword v36, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 -; GCN-NEXT: buffer_store_dword v38, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x94, v0 -; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 -; GCN-NEXT: buffer_store_dword v40, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x9c, v0 -; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0xa4, v0 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v3 +; GCN-NEXT: v_add_u32_e32 v0, 0xc0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v15 -; GCN-NEXT: v_mov_b32_e32 v9, v16 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v10, v17 -; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 -; GCN-NEXT: v_mov_b32_e32 v11, v18 -; GCN-NEXT: v_add_u32_e32 v2, 0xac, v0 -; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xb4, v0 -; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 -; GCN-NEXT: buffer_store_dword v48, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xbc, v0 -; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v2, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v4, v12 +; GCN-NEXT: v_mov_b32_e32 v5, v13 +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v7, v15 +; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v3 +; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 4, v3 +; GCN-NEXT: v_mov_b32_e32 v6, v14 +; GCN-NEXT: v_add_u32_e32 v0, 0xc8, v3 +; GCN-NEXT: v_add_u32_e32 v4, 8, v3 +; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 12, v3 +; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v2, 0xc4, v0 -; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 +; GCN-NEXT: v_add_u32_e32 v0, 0xd0, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0xd4, v3 +; GCN-NEXT: v_add_u32_e32 v4, 0xd8, v3 +; GCN-NEXT: v_add_u32_e32 v5, 0xdc, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xcc, v0 +; GCN-NEXT: v_mov_b32_e32 v6, v10 +; GCN-NEXT: v_mov_b32_e32 v7, v11 +; GCN-NEXT: v_mov_b32_e32 v8, v12 +; GCN-NEXT: v_mov_b32_e32 v9, v13 +; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 4, v0 -; GCN-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 -; GCN-NEXT: v_add_u32_e32 v7, 8, v0 -; GCN-NEXT: v_add_u32_e32 v2, 12, v0 -; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: v_add_u32_e32 v2, 0xd4, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xd8, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xdc, v0 -; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v5, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xe0, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0xe4, v3 +; GCN-NEXT: v_add_u32_e32 v4, 0xe8, v3 +; GCN-NEXT: v_add_u32_e32 v5, 0xec, v3 +; GCN-NEXT: v_add_u32_e32 v6, 0xf0, v3 +; GCN-NEXT: v_add_u32_e32 v7, 0xf4, v3 +; GCN-NEXT: v_add_u32_e32 v8, 0xf8, v3 +; GCN-NEXT: v_add_u32_e32 v9, 0xfc, v3 +; GCN-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v53, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v54, v4, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0xe4, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xe8, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xec, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xf0, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xf4, v0 -; GCN-NEXT: v_add_u32_e32 v7, 0xf8, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xfc, v0 -; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 63, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GCN-NEXT: v_add_u32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v55, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v56, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v58, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v59, v9, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v0, 63, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_add_u32_e32 v0, v3, v0 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -294,285 +432,468 @@ ; GCN-LABEL: v_extract_v128i16_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v0 ; GCN-NEXT: s_add_u32 s4, s32, 0x3fc0 -; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_and_b32 s33, s4, 0xffffc000 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_add_co_u32_e32 v52, vcc, 64, v0 ; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15 -; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc -; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11 -; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v16, v12, vcc +; GCN-NEXT: v_addc_co_u32_e32 v53, vcc, 0, v1, vcc +; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 +; GCN-NEXT: v_add_u32_e32 v3, 0x100, v3 +; GCN-NEXT: v_add_u32_e32 v60, 16, v3 +; GCN-NEXT: v_add_u32_e32 v61, 20, v3 +; GCN-NEXT: s_add_u32 s32, s32, 0x14000 +; GCN-NEXT: s_sub_u32 s32, s32, 0x14000 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:768 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:772 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:776 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:780 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:784 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:788 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:792 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:796 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:800 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:804 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:808 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:812 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:816 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:820 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:824 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:828 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v13, s5 +; GCN-NEXT: v_mov_b32_e32 v12, s4 +; GCN-NEXT: v_add_co_u32_e32 v54, vcc, v0, v12 +; GCN-NEXT: v_addc_co_u32_e32 v55, vcc, v1, v13, vcc ; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off -; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 -; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc -; GCN-NEXT: global_load_dwordx4 v[11:14], v[15:16], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[15:18], v[15:16], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[19:22], v[31:32], off -; GCN-NEXT: global_load_dwordx4 v[23:26], v[31:32], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[27:30], v[31:32], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[31:34], v[31:32], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[35:38], v[48:49], off -; GCN-NEXT: global_load_dwordx4 v[39:42], v[48:49], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[43:46], v[48:49], off offset:32 -; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 -; GCN-NEXT: v_add_u32_e32 v1, 16, v0 -; GCN-NEXT: v_add_u32_e32 v2, 20, v0 -; GCN-NEXT: s_add_u32 s32, s32, 0x10000 -; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 +; GCN-NEXT: v_mov_b32_e32 v13, s5 +; GCN-NEXT: v_mov_b32_e32 v12, s4 +; GCN-NEXT: v_add_co_u32_e32 v56, vcc, v0, v12 +; GCN-NEXT: v_addc_co_u32_e32 v57, vcc, v1, v13, vcc +; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[44:47], v[52:53], off +; GCN-NEXT: global_load_dwordx4 v[40:43], v[52:53], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[16:19], v[52:53], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v[52:53], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[36:39], v[54:55], off +; GCN-NEXT: global_load_dwordx4 v[32:35], v[54:55], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[54:55], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[54:55], off offset:48 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:720 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:764 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(16) +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[24:27], v[56:57], off +; GCN-NEXT: s_nop 0 +; GCN-NEXT: global_load_dwordx4 v[8:11], v[56:57], off offset:16 +; GCN-NEXT: v_add_u32_e32 v0, 24, v3 +; GCN-NEXT: v_add_u32_e32 v1, 28, v3 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(16) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[52:55], v[56:57], off offset:32 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: global_load_dwordx4 v[56:59], v[56:57], off offset:48 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_store_dword v4, v60, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v61, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 24, v0 -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 28, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 32, v0 -; GCN-NEXT: v_add_u32_e32 v2, 36, v0 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 40, v0 -; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 44, v0 -; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 48, v0 -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 52, v0 -; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 56, v0 -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 60, v0 +; GCN-NEXT: v_add_u32_e32 v0, 32, v3 +; GCN-NEXT: v_add_u32_e32 v1, 36, v3 +; GCN-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 40, v3 +; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 44, v3 +; GCN-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 48, v3 +; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 52, v3 +; GCN-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 56, v3 +; GCN-NEXT: v_add_u32_e32 v1, 60, v3 +; GCN-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 64, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x44, v3 +; GCN-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x48, v3 +; GCN-NEXT: buffer_store_dword v45, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v3 +; GCN-NEXT: buffer_store_dword v46, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x50, v3 +; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x54, v3 +; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x58, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v3 +; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v43, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x60, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x64, v3 +; GCN-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x68, v3 ; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 64, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x44, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v3 +; GCN-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 -; GCN-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x4c, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:752 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:756 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0x70, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x74, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x78, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v3 +; GCN-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 -; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x54, v0 -; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 -; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x64, v0 -; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 -; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x6c, v0 -; GCN-NEXT: buffer_store_dword v29, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 -; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x74, v0 -; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 -; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x84, v0 -; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 -; GCN-NEXT: buffer_store_dword v36, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x8c, v0 +; GCN-NEXT: v_add_u32_e32 v0, 0x80, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x84, v3 +; GCN-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x88, v3 ; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 -; GCN-NEXT: buffer_store_dword v38, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x94, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v3 +; GCN-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x90, v3 ; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 -; GCN-NEXT: buffer_store_dword v40, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x9c, v0 -; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0xa4, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x94, v3 +; GCN-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x98, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v3 +; GCN-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xa0, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v3 +; GCN-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xa8, v3 +; GCN-NEXT: buffer_store_dword v29, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xac, v3 +; GCN-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0xb0, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v15 -; GCN-NEXT: v_mov_b32_e32 v9, v16 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v10, v17 -; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 -; GCN-NEXT: v_mov_b32_e32 v11, v18 -; GCN-NEXT: v_add_u32_e32 v2, 0xac, v0 -; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xb4, v0 -; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 -; GCN-NEXT: buffer_store_dword v48, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xbc, v0 -; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v2, 0xc4, v0 -; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 +; GCN-NEXT: v_mov_b32_e32 v18, v30 +; GCN-NEXT: v_mov_b32_e32 v19, v31 +; GCN-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v20, v32 +; GCN-NEXT: v_add_u32_e32 v0, 0xb8, v3 +; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v21, v33 +; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v3 +; GCN-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0xc0, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xcc, v0 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_add_u32_e32 v0, 0xc8, v3 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v3 +; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 4, v0 -; GCN-NEXT: v_add_u32_e32 v7, 8, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 12, v0 -; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xd4, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xd8, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xdc, v0 -; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:768 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:772 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:776 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:780 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:784 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:788 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:792 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:796 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:800 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:804 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:808 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:812 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:816 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:820 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:824 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:828 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v1, 4, v3 +; GCN-NEXT: v_add_u32_e32 v4, 8, v3 +; GCN-NEXT: v_add_u32_e32 v0, 0xd0, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 12, v3 +; GCN-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v2 +; GCN-NEXT: v_add_u32_e32 v4, 0xd4, v3 +; GCN-NEXT: v_add_u32_e32 v5, 0xd8, v3 +; GCN-NEXT: v_add_u32_e32 v6, 0xdc, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v7, v11 +; GCN-NEXT: v_mov_b32_e32 v8, v12 +; GCN-NEXT: v_mov_b32_e32 v9, v13 +; GCN-NEXT: v_mov_b32_e32 v10, v14 +; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v6, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xe0, v3 +; GCN-NEXT: v_add_u32_e32 v4, 0xe4, v3 +; GCN-NEXT: v_add_u32_e32 v5, 0xe8, v3 +; GCN-NEXT: v_add_u32_e32 v6, 0xec, v3 +; GCN-NEXT: v_add_u32_e32 v7, 0xf0, v3 +; GCN-NEXT: v_add_u32_e32 v8, 0xf4, v3 +; GCN-NEXT: v_add_u32_e32 v9, 0xf8, v3 +; GCN-NEXT: v_add_u32_e32 v10, 0xfc, v3 +; GCN-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v53, v4, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v54, v5, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xe4, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xe8, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xec, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xf0, v0 -; GCN-NEXT: v_add_u32_e32 v7, 0xf4, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xf8, v0 -; GCN-NEXT: v_add_u32_e32 v9, 0xfc, v0 -; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v9, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 1, v10 -; GCN-NEXT: v_and_b32_e32 v1, 63, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GCN-NEXT: v_add_u32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v55, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v56, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v58, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v59, v10, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v0, 63, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_add_u32_e32 v0, v3, v0 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: v_and_b32_e32 v1, 1, v10 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: v_and_b32_e32 v1, 1, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GCN-NEXT: s_mov_b32 s33, s6 -; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: s_waitcnt vmcnt(14) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -586,8 +907,19 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s32, 0x3fc0 +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_add_co_u32_e32 v52, vcc, 64, v0 ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_and_b32 s33, s4, 0xffffc000 +; GCN-NEXT: s_movk_i32 s4, 0x80 +; GCN-NEXT: v_mov_b32_e32 v13, s5 +; GCN-NEXT: v_mov_b32_e32 v12, s4 +; GCN-NEXT: v_addc_co_u32_e32 v53, vcc, 0, v1, vcc +; GCN-NEXT: v_add_co_u32_e32 v54, vcc, v0, v12 +; GCN-NEXT: v_addc_co_u32_e32 v55, vcc, v1, v13, vcc +; GCN-NEXT: s_movk_i32 s4, 0xc0 +; GCN-NEXT: v_mov_b32_e32 v13, s5 +; GCN-NEXT: v_mov_b32_e32 v12, s4 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill @@ -603,249 +935,381 @@ ; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v15, v0 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[0:3], v[15:16], off -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15 -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc -; GCN-NEXT: s_add_u32 s32, s32, 0x10000 -; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11 -; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v16, v12, vcc -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 -; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 -; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc -; GCN-NEXT: global_load_dwordx4 v[11:14], v[15:16], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[15:18], v[15:16], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[19:22], v[31:32], off -; GCN-NEXT: global_load_dwordx4 v[23:26], v[31:32], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[27:30], v[31:32], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[31:34], v[31:32], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[35:38], v[48:49], off -; GCN-NEXT: global_load_dwordx4 v[39:42], v[48:49], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[43:46], v[48:49], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[3:6], v[59:60], off -; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 -; GCN-NEXT: v_add_u32_e32 v1, 16, v0 -; GCN-NEXT: v_add_u32_e32 v2, 24, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 20, v0 -; GCN-NEXT: v_add_u32_e32 v1, 44, v0 -; GCN-NEXT: v_add_u32_e32 v7, 28, v0 -; GCN-NEXT: v_add_u32_e32 v9, 36, v0 -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen +; GCN-NEXT: v_add_co_u32_e32 v56, vcc, v0, v12 +; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GCN-NEXT: v_addc_co_u32_e32 v57, vcc, v1, v13, vcc +; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[40:43], v[52:53], off +; GCN-NEXT: global_load_dwordx4 v[36:39], v[52:53], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[32:35], v[52:53], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[16:19], v[52:53], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[44:47], v[54:55], off +; GCN-NEXT: global_load_dwordx4 v[48:51], v[54:55], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[54:55], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[54:55], off offset:48 +; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 +; GCN-NEXT: v_add_u32_e32 v3, 0x100, v3 +; GCN-NEXT: v_add_u32_e32 v60, 16, v3 +; GCN-NEXT: v_add_u32_e32 v61, 24, v3 +; GCN-NEXT: v_add_u32_e32 v0, 32, v3 +; GCN-NEXT: v_add_u32_e32 v1, 20, v3 +; GCN-NEXT: v_add_u32_e32 v62, 60, v3 +; GCN-NEXT: s_add_u32 s32, s32, 0x14000 +; GCN-NEXT: s_sub_u32 s32, s32, 0x14000 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:720 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:764 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(16) +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[28:31], v[56:57], off +; GCN-NEXT: s_nop 0 +; GCN-NEXT: global_load_dwordx4 v[44:47], v[56:57], off offset:16 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(16) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[52:55], v[56:57], off offset:32 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: global_load_dwordx4 v[56:59], v[56:57], off offset:48 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_store_dword v4, v60, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v61, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v4, 28, v3 +; GCN-NEXT: v_add_u32_e32 v6, 36, v3 +; GCN-NEXT: v_add_u32_e32 v60, 44, v3 +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v60, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 40, v3 +; GCN-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 48, v3 +; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v61, 52, v3 +; GCN-NEXT: v_add_u32_e32 v1, 56, v3 +; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 40, v0 -; GCN-NEXT: v_add_u32_e32 v3, 32, v0 -; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 48, v0 -; GCN-NEXT: v_add_u32_e32 v1, 56, v0 -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v4, 52, v0 -; GCN-NEXT: v_add_u32_e32 v5, 60, v0 -; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x4c, v0 -; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 64, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x48, v0 -; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 -; GCN-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x58, v0 -; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0x54, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0x5c, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0x64, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0x6c, v0 -; GCN-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v6, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x68, v0 -; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 -; GCN-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v7, 0x74, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0x7c, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x78, v0 -; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v8, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x88, v0 -; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 -; GCN-NEXT: buffer_store_dword v37, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x98, v0 -; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0x94, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0x9c, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xa4, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xac, v0 -; GCN-NEXT: buffer_store_dword v41, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v6, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xa8, v0 +; GCN-NEXT: buffer_store_dword v13, v61, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v62, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x44, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v3 +; GCN-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v43, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 -; GCN-NEXT: buffer_store_dword v45, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v7, 0xb4, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xbc, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0xb8, v0 -; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v2, 0xc8, v0 -; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 +; GCN-NEXT: v_add_u32_e32 v0, 64, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x48, v3 +; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x50, v3 +; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x58, v3 +; GCN-NEXT: v_add_u32_e32 v0, 0x60, v3 +; GCN-NEXT: v_add_u32_e32 v4, 0x54, v3 +; GCN-NEXT: v_add_u32_e32 v5, 0x5c, v3 +; GCN-NEXT: v_add_u32_e32 v6, 0x64, v3 +; GCN-NEXT: v_add_u32_e32 v7, 0x6c, v3 +; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v37, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v39, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v33, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v35, v7, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x68, v3 +; GCN-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x70, v3 +; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v12, 0x74, v3 +; GCN-NEXT: v_add_u32_e32 v13, 0x7c, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x78, v3 +; GCN-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v19, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s33 offset:752 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s33 offset:756 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s33 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0x84, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v3 +; GCN-NEXT: v_add_u32_e32 v4, 0x94, v3 +; GCN-NEXT: v_add_u32_e32 v5, 0x9c, v3 +; GCN-NEXT: v_add_u32_e32 v6, 0xa4, v3 +; GCN-NEXT: v_add_u32_e32 v7, 0xac, v3 +; GCN-NEXT: v_add_u32_e32 v12, 0xb4, v3 +; GCN-NEXT: v_add_u32_e32 v13, 0xbc, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v7, 0xec, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xf4, v0 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v10, v4 -; GCN-NEXT: v_add_u32_e32 v2, 0xc4, v0 -; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NEXT: v_mov_b32_e32 v11, v5 -; GCN-NEXT: v_add_u32_e32 v3, 0xcc, v0 -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v2, 8, v0 -; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 12, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xd4, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xdc, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xe4, v0 -; GCN-NEXT: v_add_u32_e32 v9, 0xfc, v0 +; GCN-NEXT: v_mov_b32_e32 v14, v24 +; GCN-NEXT: v_mov_b32_e32 v15, v25 +; GCN-NEXT: v_mov_b32_e32 v17, v27 +; GCN-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x80, v3 +; GCN-NEXT: v_mov_b32_e32 v16, v26 +; GCN-NEXT: v_add_u32_e32 v1, 0x88, v3 +; GCN-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0x90, v3 +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x98, v3 +; GCN-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v49, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v51, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0xa0, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 4, v0 -; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:256 -; GCN-NEXT: v_add_u32_e32 v2, 0xd8, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xe0, v0 -; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xe8, v0 -; GCN-NEXT: buffer_store_dword v53, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xf0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xf8, v0 -; GCN-NEXT: buffer_store_dword v57, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: v_mov_b32_e32 v14, v32 +; GCN-NEXT: v_mov_b32_e32 v15, v33 +; GCN-NEXT: v_mov_b32_e32 v17, v35 +; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v7, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v16, v34 +; GCN-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xb0, v3 +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v3 +; GCN-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v3 +; GCN-NEXT: v_add_u32_e32 v0, 0xc0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 31, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GCN-NEXT: v_add_u32_e32 v0, v0, v1 -; GCN-NEXT: v_add_u32_e32 v1, 4, v0 -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v4, v12 +; GCN-NEXT: v_mov_b32_e32 v7, v15 +; GCN-NEXT: v_mov_b32_e32 v6, v14 +; GCN-NEXT: v_mov_b32_e32 v5, v13 +; GCN-NEXT: v_mov_b32_e32 v17, v7 +; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v15, v5 +; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v3 +; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v14, v4 +; GCN-NEXT: v_mov_b32_e32 v16, v6 +; GCN-NEXT: v_add_u32_e32 v4, 0xcc, v3 +; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 8, v3 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 4, v3 +; GCN-NEXT: v_add_u32_e32 v4, 12, v3 +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v0, 0xd0, v3 +; GCN-NEXT: v_add_u32_e32 v4, 0xe0, v3 +; GCN-NEXT: v_add_u32_e32 v1, 0xd8, v3 +; GCN-NEXT: buffer_store_dword v52, v4, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v5, 0xd4, v3 +; GCN-NEXT: v_add_u32_e32 v6, 0xdc, v3 +; GCN-NEXT: v_add_u32_e32 v7, 0xe4, v3 +; GCN-NEXT: v_add_u32_e32 v12, 0xec, v3 +; GCN-NEXT: v_add_u32_e32 v13, 0xf4, v3 +; GCN-NEXT: v_add_u32_e32 v14, 0xfc, v3 +; GCN-NEXT: v_add_u32_e32 v4, 0xf8, v3 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mov_b32_e32 v8, v19 +; GCN-NEXT: v_mov_b32_e32 v10, v21 +; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v0, 0xe8, v3 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v9, v20 +; GCN-NEXT: v_mov_b32_e32 v11, v22 +; GCN-NEXT: v_add_u32_e32 v1, 0xf0, v3 +; GCN-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v56, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v58, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v53, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v55, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v59, v14, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v0, 31, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_add_u32_e32 v2, v3, v0 +; GCN-NEXT: v_add_u32_e32 v3, 4, v2 +; GCN-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -60,15 +60,15 @@ ; ; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[6:7] +; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[10:11] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[10:11] +; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[18:19] ; GFX8-NEXT: flat_load_dwordx4 v[14:17], v[0:1] ; GFX8-NEXT: s_lshl_b32 s0, s2, 1 ; GFX8-NEXT: s_lshl_b32 m0, s0, 1 @@ -113,57 +113,57 @@ ; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 1, v2 -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: v_add_u32_e32 v17, 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v8, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v9, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, v13, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v10, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, v11, v7, vcc ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v14, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -173,8 +173,8 @@ ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[3:4] ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 1, v2 +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[3:4] ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v16 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -1672,7 +1672,7 @@ ; GPRIDX-NEXT: is_ptr64 = 1 ; GPRIDX-NEXT: is_dynamic_callstack = 0 ; GPRIDX-NEXT: is_debug_enabled = 0 -; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 1 ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 @@ -1764,7 +1764,7 @@ ; MOVREL-NEXT: is_ptr64 = 1 ; MOVREL-NEXT: is_dynamic_callstack = 0 ; MOVREL-NEXT: is_debug_enabled = 0 -; MOVREL-NEXT: is_xnack_enabled = 0 +; MOVREL-NEXT: is_xnack_enabled = 1 ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 @@ -2187,7 +2187,7 @@ ; GPRIDX-NEXT: is_ptr64 = 1 ; GPRIDX-NEXT: is_dynamic_callstack = 0 ; GPRIDX-NEXT: is_debug_enabled = 0 -; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 1 ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 @@ -2272,7 +2272,7 @@ ; MOVREL-NEXT: is_ptr64 = 1 ; MOVREL-NEXT: is_dynamic_callstack = 0 ; MOVREL-NEXT: is_debug_enabled = 0 -; MOVREL-NEXT: is_xnack_enabled = 0 +; MOVREL-NEXT: is_xnack_enabled = 1 ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 @@ -2363,7 +2363,7 @@ ; GPRIDX-NEXT: is_ptr64 = 1 ; GPRIDX-NEXT: is_dynamic_callstack = 0 ; GPRIDX-NEXT: is_debug_enabled = 0 -; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 1 ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 @@ -2451,7 +2451,7 @@ ; MOVREL-NEXT: is_ptr64 = 1 ; MOVREL-NEXT: is_dynamic_callstack = 0 ; MOVREL-NEXT: is_debug_enabled = 0 -; MOVREL-NEXT: is_xnack_enabled = 0 +; MOVREL-NEXT: is_xnack_enabled = 1 ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -41,13 +41,13 @@ ; ; VI-LABEL: frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_add_u32 s0, s8, 8 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_addc_u32 s1, s9, 0 +; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_add_u32 s0, s4, 8 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: flat_load_ushort v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -62,8 +62,8 @@ ; VI-NEXT: v_div_fixup_f16 v1, v1, v0, v2 ; VI-NEXT: v_trunc_f16_e32 v1, v1 ; VI-NEXT: v_fma_f16 v2, -v1, v0, v2 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 @@ -101,13 +101,13 @@ ; ; VI-LABEL: fast_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_add_u32 s0, s8, 8 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_addc_u32 s1, s9, 0 +; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_add_u32 s0, s4, 8 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: flat_load_ushort v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -117,8 +117,8 @@ ; VI-NEXT: v_mul_f16_e32 v1, v2, v1 ; VI-NEXT: v_trunc_f16_e32 v1, v1 ; VI-NEXT: v_fma_f16 v2, -v1, v0, v2 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 @@ -168,13 +168,13 @@ ; ; VI-LABEL: unsafe_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_add_u32 s0, s8, 8 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_addc_u32 s1, s9, 0 +; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_add_u32 s0, s4, 8 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: flat_load_ushort v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -189,8 +189,8 @@ ; VI-NEXT: v_div_fixup_f16 v1, v1, v0, v2 ; VI-NEXT: v_trunc_f16_e32 v1, v1 ; VI-NEXT: v_fma_f16 v2, -v1, v0, v2 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 @@ -235,6 +235,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[8:9], 0x10 @@ -290,6 +291,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[6:7], 0x0 ; VI-NEXT: s_load_dword s1, s[8:9], 0x10 @@ -345,6 +347,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[8:9], 0x10 @@ -410,6 +413,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 @@ -473,6 +477,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 @@ -536,6 +541,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 @@ -628,6 +634,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[6:7], 0x0 ; VI-NEXT: s_load_dword s1, s[8:9], 0x10 @@ -770,6 +777,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x20 @@ -881,6 +889,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x20 @@ -1010,10 +1019,11 @@ ; VI-LABEL: frem_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 +; VI-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x40 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0 @@ -1139,12 +1149,13 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 +; VI-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x40 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1] ; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] ; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -1158,8 +1169,8 @@ ; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1] ; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] ; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 ; VI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], s[2:3] ; VI-NEXT: v_div_scale_f64 v[10:11], vcc, s[2:3], v[2:3], s[2:3] ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -12,7 +12,7 @@ ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GCN-NEXT: v_mov_b32_e32 v0, 0x100 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_add_u32_e32 v1, 4, v0 +; GCN-NEXT: v_add_u32_e32 v16, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x0 ; GCN-NEXT: s_load_dwordx16 s[68:83], s[10:11], 0x40 @@ -20,224 +20,225 @@ ; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0xc0 ; GCN-NEXT: s_movk_i32 s4, 0x50 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, s13 -; GCN-NEXT: v_mov_b32_e32 v5, s14 -; GCN-NEXT: v_mov_b32_e32 v6, s15 -; GCN-NEXT: v_mov_b32_e32 v8, s16 -; GCN-NEXT: v_mov_b32_e32 v10, s17 -; GCN-NEXT: v_mov_b32_e32 v12, s18 -; GCN-NEXT: v_mov_b32_e32 v14, s19 +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NEXT: v_mov_b32_e32 v4, s16 +; GCN-NEXT: v_mov_b32_e32 v5, s17 +; GCN-NEXT: v_mov_b32_e32 v6, s18 +; GCN-NEXT: v_mov_b32_e32 v7, s19 ; GCN-NEXT: s_movk_i32 s5, 0x60 -; GCN-NEXT: v_add_u32_e32 v2, 8, v0 -; GCN-NEXT: v_add_u32_e32 v3, 12, v0 -; GCN-NEXT: v_add_u32_e32 v7, 16, v0 -; GCN-NEXT: v_add_u32_e32 v9, 20, v0 -; GCN-NEXT: v_add_u32_e32 v11, 24, v0 -; GCN-NEXT: v_add_u32_e32 v13, 28, v0 -; GCN-NEXT: v_add_u32_e32 v15, 32, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s20 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v17, 36, v0 -; GCN-NEXT: v_mov_b32_e32 v18, s21 -; GCN-NEXT: v_mov_b32_e32 v26, s25 +; GCN-NEXT: v_add_u32_e32 v17, 8, v0 +; GCN-NEXT: v_add_u32_e32 v18, 12, v0 +; GCN-NEXT: v_add_u32_e32 v19, 16, v0 +; GCN-NEXT: v_add_u32_e32 v20, 20, v0 +; GCN-NEXT: v_add_u32_e32 v21, 24, v0 +; GCN-NEXT: v_add_u32_e32 v22, 28, v0 +; GCN-NEXT: v_add_u32_e32 v23, 32, v0 +; GCN-NEXT: v_mov_b32_e32 v8, s20 +; GCN-NEXT: buffer_store_dword v1, v16, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v24, 36, v0 +; GCN-NEXT: v_mov_b32_e32 v9, s21 +; GCN-NEXT: v_mov_b32_e32 v32, s68 ; GCN-NEXT: v_add_u32_e32 v33, 0x44, v0 ; GCN-NEXT: v_mov_b32_e32 v34, s69 -; GCN-NEXT: v_mov_b32_e32 v4, s71 -; GCN-NEXT: v_add_u32_e32 v19, 40, v0 -; GCN-NEXT: v_mov_b32_e32 v20, s22 -; GCN-NEXT: v_add_u32_e32 v21, 44, v0 -; GCN-NEXT: v_mov_b32_e32 v22, s23 -; GCN-NEXT: v_add_u32_e32 v23, 48, v0 -; GCN-NEXT: v_mov_b32_e32 v24, s24 -; GCN-NEXT: v_add_u32_e32 v25, 52, v0 -; GCN-NEXT: v_add_u32_e32 v27, 56, v0 -; GCN-NEXT: v_mov_b32_e32 v28, s26 -; GCN-NEXT: v_add_u32_e32 v29, 60, v0 -; GCN-NEXT: v_mov_b32_e32 v30, s27 +; GCN-NEXT: v_mov_b32_e32 v1, s71 +; GCN-NEXT: s_movk_i32 s13, 0x70 +; GCN-NEXT: v_add_u32_e32 v25, 40, v0 +; GCN-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NEXT: v_add_u32_e32 v26, 44, v0 +; GCN-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NEXT: v_add_u32_e32 v27, 48, v0 +; GCN-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NEXT: v_add_u32_e32 v28, 52, v0 +; GCN-NEXT: v_mov_b32_e32 v13, s25 +; GCN-NEXT: v_add_u32_e32 v29, 56, v0 +; GCN-NEXT: v_mov_b32_e32 v14, s26 +; GCN-NEXT: v_add_u32_e32 v30, 60, v0 +; GCN-NEXT: v_mov_b32_e32 v15, s27 ; GCN-NEXT: v_add_u32_e32 v31, 64, v0 -; GCN-NEXT: v_mov_b32_e32 v32, s68 -; GCN-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v30, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v32, v31, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s13, 0x70 ; GCN-NEXT: v_add_u32_e32 v35, 0x48, v0 ; GCN-NEXT: v_mov_b32_e32 v36, s70 ; GCN-NEXT: v_add_u32_e32 v37, 0x4c, v0 ; GCN-NEXT: v_add_u32_e32 v38, s4, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s72 +; GCN-NEXT: v_mov_b32_e32 v2, s72 ; GCN-NEXT: v_add_u32_e32 v39, 0x54, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s73 +; GCN-NEXT: v_mov_b32_e32 v3, s73 ; GCN-NEXT: v_add_u32_e32 v40, 0x58, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s74 +; GCN-NEXT: v_mov_b32_e32 v4, s74 ; GCN-NEXT: v_add_u32_e32 v41, 0x5c, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s75 +; GCN-NEXT: v_mov_b32_e32 v5, s75 ; GCN-NEXT: v_add_u32_e32 v42, s5, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s76 +; GCN-NEXT: v_mov_b32_e32 v6, s76 ; GCN-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v36, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v42, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v26, 0x64, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s77 -; GCN-NEXT: v_mov_b32_e32 v4, s81 +; GCN-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v42, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v32, 0x64, v0 +; GCN-NEXT: v_mov_b32_e32 v7, s77 +; GCN-NEXT: v_mov_b32_e32 v1, s81 ; GCN-NEXT: s_movk_i32 s14, 0x90 ; GCN-NEXT: s_movk_i32 s15, 0xa0 -; GCN-NEXT: v_add_u32_e32 v28, 0x68, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s78 -; GCN-NEXT: v_add_u32_e32 v30, 0x6c, v0 -; GCN-NEXT: v_mov_b32_e32 v18, s79 -; GCN-NEXT: v_add_u32_e32 v32, s13, v0 -; GCN-NEXT: v_mov_b32_e32 v20, s80 +; GCN-NEXT: v_add_u32_e32 v43, 0x68, v0 +; GCN-NEXT: v_mov_b32_e32 v8, s78 +; GCN-NEXT: v_add_u32_e32 v44, 0x6c, v0 +; GCN-NEXT: v_mov_b32_e32 v9, s79 +; GCN-NEXT: v_add_u32_e32 v45, s13, v0 +; GCN-NEXT: v_mov_b32_e32 v10, s80 ; GCN-NEXT: v_add_u32_e32 v34, 0x74, v0 ; GCN-NEXT: v_add_u32_e32 v36, 0x78, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s82 -; GCN-NEXT: v_add_u32_e32 v43, 0x7c, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s83 -; GCN-NEXT: v_add_u32_e32 v44, 0x80, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s52 -; GCN-NEXT: buffer_store_dword v14, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v44, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v45, 0x84, v0 -; GCN-NEXT: v_mov_b32_e32 v4, s53 +; GCN-NEXT: v_mov_b32_e32 v2, s82 +; GCN-NEXT: v_add_u32_e32 v46, 0x7c, v0 +; GCN-NEXT: v_mov_b32_e32 v3, s83 +; GCN-NEXT: v_add_u32_e32 v47, 0x80, v0 +; GCN-NEXT: v_mov_b32_e32 v4, s52 +; GCN-NEXT: buffer_store_dword v7, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v44, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v45, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v46, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v47, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v48, 0x84, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s53 ; GCN-NEXT: s_movk_i32 s16, 0xb0 -; GCN-NEXT: v_add_u32_e32 v46, 0x88, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s54 -; GCN-NEXT: v_add_u32_e32 v47, 0x8c, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s55 -; GCN-NEXT: v_add_u32_e32 v48, s14, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s56 -; GCN-NEXT: v_add_u32_e32 v49, 0x94, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s57 -; GCN-NEXT: v_add_u32_e32 v50, 0x98, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s58 -; GCN-NEXT: v_add_u32_e32 v51, 0x9c, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s59 -; GCN-NEXT: v_add_u32_e32 v52, s15, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s60 -; GCN-NEXT: buffer_store_dword v4, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v52, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v53, 0xa4, v0 -; GCN-NEXT: v_mov_b32_e32 v4, s61 +; GCN-NEXT: v_add_u32_e32 v49, 0x88, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s54 +; GCN-NEXT: v_add_u32_e32 v50, 0x8c, v0 +; GCN-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NEXT: v_add_u32_e32 v51, s14, v0 +; GCN-NEXT: v_mov_b32_e32 v4, s56 +; GCN-NEXT: v_add_u32_e32 v52, 0x94, v0 +; GCN-NEXT: v_mov_b32_e32 v5, s57 +; GCN-NEXT: v_add_u32_e32 v53, 0x98, v0 +; GCN-NEXT: v_mov_b32_e32 v6, s58 +; GCN-NEXT: v_add_u32_e32 v54, 0x9c, v0 +; GCN-NEXT: v_mov_b32_e32 v7, s59 +; GCN-NEXT: v_add_u32_e32 v55, s15, v0 +; GCN-NEXT: v_mov_b32_e32 v8, s60 +; GCN-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v55, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v56, 0xa4, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s61 ; GCN-NEXT: s_movk_i32 s17, 0xd0 ; GCN-NEXT: s_movk_i32 s18, 0xe0 -; GCN-NEXT: v_add_u32_e32 v54, 0xa8, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s62 -; GCN-NEXT: v_add_u32_e32 v55, 0xac, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s63 -; GCN-NEXT: v_add_u32_e32 v56, s16, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s64 -; GCN-NEXT: v_add_u32_e32 v57, 0xb4, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s65 -; GCN-NEXT: v_add_u32_e32 v58, 0xb8, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s66 -; GCN-NEXT: v_add_u32_e32 v59, 0xbc, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s67 -; GCN-NEXT: v_add_u32_e32 v60, 0xc0, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s36 -; GCN-NEXT: buffer_store_dword v4, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v60, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v61, 0xc4, v0 -; GCN-NEXT: v_mov_b32_e32 v4, s37 +; GCN-NEXT: v_add_u32_e32 v57, 0xa8, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s62 +; GCN-NEXT: v_add_u32_e32 v58, 0xac, v0 +; GCN-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NEXT: v_add_u32_e32 v59, s16, v0 +; GCN-NEXT: v_mov_b32_e32 v4, s64 +; GCN-NEXT: v_add_u32_e32 v60, 0xb4, v0 +; GCN-NEXT: v_mov_b32_e32 v5, s65 +; GCN-NEXT: v_add_u32_e32 v61, 0xb8, v0 +; GCN-NEXT: v_mov_b32_e32 v6, s66 +; GCN-NEXT: v_add_u32_e32 v62, 0xbc, v0 +; GCN-NEXT: v_mov_b32_e32 v7, s67 +; GCN-NEXT: v_add_u32_e32 v63, 0xc0, v0 +; GCN-NEXT: v_mov_b32_e32 v8, s36 +; GCN-NEXT: buffer_store_dword v1, v56, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v57, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v58, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v59, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v60, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v61, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v63, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v64, 0xc4, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: s_and_b32 s7, s7, 63 ; GCN-NEXT: s_movk_i32 s19, 0xf0 -; GCN-NEXT: v_add_u32_e32 v62, 0xc8, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s38 -; GCN-NEXT: v_add_u32_e32 v63, 0xcc, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s39 -; GCN-NEXT: v_add_u32_e32 v64, s17, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s40 -; GCN-NEXT: v_add_u32_e32 v65, 0xd4, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s41 -; GCN-NEXT: v_add_u32_e32 v66, 0xd8, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s42 -; GCN-NEXT: v_add_u32_e32 v67, 0xdc, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s43 -; GCN-NEXT: v_add_u32_e32 v68, s18, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s44 -; GCN-NEXT: buffer_store_dword v4, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v63, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v64, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v65, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v66, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v67, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v68, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v69, 0xe4, v0 -; GCN-NEXT: v_mov_b32_e32 v4, s45 -; GCN-NEXT: v_add_u32_e32 v70, 0xe8, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s46 -; GCN-NEXT: v_add_u32_e32 v71, 0xec, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s47 -; GCN-NEXT: v_add_u32_e32 v72, s19, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s48 -; GCN-NEXT: v_add_u32_e32 v73, 0xf4, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s49 -; GCN-NEXT: v_add_u32_e32 v74, 0xf8, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s50 -; GCN-NEXT: buffer_store_dword v4, v69, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v70, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v71, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v72, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v73, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v74, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: v_add_u32_e32 v65, 0xc8, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NEXT: v_add_u32_e32 v66, 0xcc, v0 +; GCN-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NEXT: v_add_u32_e32 v67, s17, v0 +; GCN-NEXT: v_mov_b32_e32 v4, s40 +; GCN-NEXT: v_add_u32_e32 v68, 0xd4, v0 +; GCN-NEXT: v_mov_b32_e32 v5, s41 +; GCN-NEXT: v_add_u32_e32 v69, 0xd8, v0 +; GCN-NEXT: v_mov_b32_e32 v6, s42 +; GCN-NEXT: v_add_u32_e32 v70, 0xdc, v0 +; GCN-NEXT: v_mov_b32_e32 v7, s43 +; GCN-NEXT: v_add_u32_e32 v71, s18, v0 +; GCN-NEXT: v_mov_b32_e32 v8, s44 +; GCN-NEXT: buffer_store_dword v1, v64, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v65, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v66, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v67, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v68, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v69, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v70, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v71, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v72, 0xe4, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NEXT: v_add_u32_e32 v73, 0xe8, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s46 +; GCN-NEXT: v_add_u32_e32 v74, 0xec, v0 +; GCN-NEXT: v_mov_b32_e32 v3, s47 +; GCN-NEXT: v_add_u32_e32 v75, s19, v0 +; GCN-NEXT: v_mov_b32_e32 v4, s48 +; GCN-NEXT: v_add_u32_e32 v76, 0xf4, v0 +; GCN-NEXT: v_mov_b32_e32 v5, s49 +; GCN-NEXT: v_add_u32_e32 v77, 0xf8, v0 +; GCN-NEXT: v_mov_b32_e32 v6, s50 +; GCN-NEXT: buffer_store_dword v1, v72, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v73, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v74, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v75, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v76, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v77, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s12 ; GCN-NEXT: s_lshl_b32 s7, s7, 2 -; GCN-NEXT: v_add_u32_e32 v75, 0xfc, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s51 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:256 -; GCN-NEXT: buffer_store_dword v14, v75, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_add_u32_e32 v78, 0xfc, v0 +; GCN-NEXT: v_mov_b32_e32 v7, s51 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:256 +; GCN-NEXT: buffer_store_dword v7, v78, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: v_add_u32_e32 v0, s7, v0 -; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v5, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v6, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v9, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v10, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v11, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v12, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v13, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v14, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v15, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, v16, s[0:3], 0 offen +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v2, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v3, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v4, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v5, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v6, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v8, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v9, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v10, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v11, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v12, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v13, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v14, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v15, v30, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v16, v31, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v17, v33, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v18, v35, s[0:3], 0 offen @@ -247,45 +248,46 @@ ; GCN-NEXT: buffer_load_dword v22, v40, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v23, v41, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v24, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v25, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v27, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v28, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v25, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v26, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v27, v44, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v28, v45, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v29, v34, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v30, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v31, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v32, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v33, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v34, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v35, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v36, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v37, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v38, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v39, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v40, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v41, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v42, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v43, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v44, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v45, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v46, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v47, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v48, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v49, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v50, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v51, v63, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v52, v64, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v53, v65, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v54, v66, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v55, v67, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v56, v68, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v57, v69, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v58, v70, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, v71, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, v72, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v61, v73, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, v74, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, v75, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v31, v46, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v32, v47, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v33, v48, s[0:3], 0 offen +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v34, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v35, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v36, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v37, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v38, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v39, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v40, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v41, v56, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v42, v57, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v43, v58, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v44, v59, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v45, v60, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v46, v61, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v47, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v48, v63, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v49, v64, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v50, v65, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v51, v66, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v52, v67, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v53, v68, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v54, v69, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v55, v70, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v56, v71, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v57, v72, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v58, v73, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v59, v74, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v60, v75, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v61, v76, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v62, v77, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, v78, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256 ; GCN-NEXT: s_add_u32 s6, s8, 16 ; GCN-NEXT: s_addc_u32 s7, s9, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -2136,8 +2136,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v16i16_s_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: s_and_b32 s1, s3, 1 ; GFX9-NEXT: s_lshr_b32 s12, s3, 1 ; GFX9-NEXT: s_mov_b32 s0, 0xffff @@ -2155,29 +2155,29 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] ; GFX9-NEXT: s_add_u32 s0, 0, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] -; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v11, s1 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v10, s0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off @@ -2185,10 +2185,10 @@ ; ; GFX8-LABEL: insertelement_v_v16i16_s_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[2:3] ; GFX8-NEXT: s_and_b32 s1, s3, 1 ; GFX8-NEXT: s_lshr_b32 s12, s3, 1 ; GFX8-NEXT: s_mov_b32 s0, 0xffff @@ -2205,30 +2205,30 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v3, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[10:11] -; GFX8-NEXT: v_and_b32_e32 v8, s14, v8 -; GFX8-NEXT: v_or_b32_e32 v8, s13, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[10:11] +; GFX8-NEXT: v_and_b32_e32 v0, s14, v0 +; GFX8-NEXT: v_or_b32_e32 v12, s13, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] ; GFX8-NEXT: s_add_u32 s0, 0, 16 -; GFX8-NEXT: s_addc_u32 s1, 0, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v11, s1 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -2855,8 +2855,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v16i16_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX9-NEXT: s_mov_b32 s0, 0xffff @@ -2874,28 +2874,28 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] -; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] ; GFX9-NEXT: s_add_u32 s0, 0, 16 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_addc_u32 s1, 0, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v11, s1 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v10, s0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off @@ -2903,10 +2903,10 @@ ; ; GFX8-LABEL: insertelement_v_v16i16_s_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[12:13] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX8-NEXT: s_mov_b32 s0, 0xffff @@ -2924,29 +2924,29 @@ ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] -; GFX8-NEXT: v_and_b32_e32 v1, v11, v1 -; GFX8-NEXT: v_or_b32_e32 v11, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] +; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v12, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] ; GFX8-NEXT: s_add_u32 s0, 0, 16 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: s_addc_u32 s1, 0, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v11, s1 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -3007,8 +3007,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v16i16_v_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: s_and_b32 s1, s2, 1 ; GFX9-NEXT: s_lshr_b32 s12, s2, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 @@ -3024,29 +3024,29 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] -; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX9-NEXT: s_add_u32 s0, 0, 16 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX9-NEXT: s_add_u32 s0, 0, 16 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_addc_u32 s1, 0, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v11, s1 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v10, s0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off @@ -3054,10 +3054,10 @@ ; ; GFX8-LABEL: insertelement_v_v16i16_v_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[12:13] ; GFX8-NEXT: s_and_b32 s1, s2, 1 ; GFX8-NEXT: s_lshr_b32 s12, s2, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 @@ -3074,30 +3074,30 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11] ; GFX8-NEXT: v_and_b32_e32 v1, s13, v1 -; GFX8-NEXT: v_or_b32_e32 v11, v1, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX8-NEXT: s_add_u32 s0, 0, 16 +; GFX8-NEXT: v_or_b32_e32 v12, v1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] +; GFX8-NEXT: s_add_u32 s0, 0, 16 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: s_addc_u32 s1, 0, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v11, s1 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -3158,8 +3158,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v16i16_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 @@ -3176,28 +3176,28 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] ; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] ; GFX9-NEXT: s_add_u32 s0, 0, 16 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_addc_u32 s1, 0, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v11, s1 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v10, s0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off @@ -3205,10 +3205,10 @@ ; ; GFX8-LABEL: insertelement_v_v16i16_v_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[12:13] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 @@ -3225,29 +3225,29 @@ ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] ; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_or_b32_e32 v12, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] ; GFX8-NEXT: s_add_u32 s0, 0, 16 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: s_addc_u32 s1, 0, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v11, s1 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-local.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-local.mir @@ -15,14 +15,6 @@ bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX6-LABEL: name: atomic_cmpxchg_s32_local - ; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] ; GFX7-LABEL: name: atomic_cmpxchg_s32_local ; GFX7: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -38,6 +30,14 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[DS_CMPST_RTN_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_CMPST_RTN_B32_gfx9_]] + ; GFX6-LABEL: name: atomic_cmpxchg_s32_local + ; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -55,16 +55,6 @@ bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX6-LABEL: name: atomic_cmpxchg_s32_local_gep4 - ; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GFX6: %4:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 %4, [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] ; GFX7-LABEL: name: atomic_cmpxchg_s32_local_gep4 ; GFX7: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -80,6 +70,16 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[DS_CMPST_RTN_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 4, 0, implicit $exec :: (load store seq_cst 4, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_CMPST_RTN_B32_gfx9_]] + ; GFX6-LABEL: name: atomic_cmpxchg_s32_local_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec + ; GFX6: %4:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 %4, [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -99,14 +99,6 @@ bb.0: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 - ; GFX6-LABEL: name: atomic_cmpxchg_s64_local - ; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 - ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] ; GFX7-LABEL: name: atomic_cmpxchg_s64_local ; GFX7: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -122,6 +114,14 @@ ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 ; GFX9: [[DS_CMPST_RTN_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_gfx9_]] + ; GFX6-LABEL: name: atomic_cmpxchg_s64_local + ; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s64) = COPY $vgpr1_vgpr2 %2:vgpr(s64) = COPY $vgpr3_vgpr4 @@ -139,14 +139,6 @@ bb.0: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 - ; GFX6-LABEL: name: atomic_cmpxchg_s64_local_gep4 - ; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 - ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] ; GFX7-LABEL: name: atomic_cmpxchg_s64_local_gep4 ; GFX7: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -162,6 +154,14 @@ ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 ; GFX9: [[DS_CMPST_RTN_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_gfx9_]] + ; GFX6-LABEL: name: atomic_cmpxchg_s64_local_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s64) = COPY $vgpr1_vgpr2 %2:vgpr(s64) = COPY $vgpr3_vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir @@ -15,14 +15,6 @@ bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX6-LABEL: name: atomic_cmpxchg_s32_region - ; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) - ; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] ; GFX7-LABEL: name: atomic_cmpxchg_s32_region ; GFX7: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -38,6 +30,14 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) ; GFX9: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + ; GFX6-LABEL: name: atomic_cmpxchg_s32_region + ; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -55,16 +55,6 @@ bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX6-LABEL: name: atomic_cmpxchg_s32_region_gep4 - ; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GFX6: %4:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 %4, [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) - ; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] ; GFX7-LABEL: name: atomic_cmpxchg_s32_region_gep4 ; GFX7: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -80,6 +70,16 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) ; GFX9: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + ; GFX6-LABEL: name: atomic_cmpxchg_s32_region_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec + ; GFX6: %4:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 %4, [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -99,14 +99,6 @@ bb.0: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 - ; GFX6-LABEL: name: atomic_cmpxchg_s64_region - ; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 - ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) - ; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] ; GFX7-LABEL: name: atomic_cmpxchg_s64_region ; GFX7: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -122,6 +114,14 @@ ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 ; GFX9: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + ; GFX6-LABEL: name: atomic_cmpxchg_s64_region + ; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s64) = COPY $vgpr1_vgpr2 %2:vgpr(s64) = COPY $vgpr3_vgpr4 @@ -139,14 +139,6 @@ bb.0: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 - ; GFX6-LABEL: name: atomic_cmpxchg_s64_region_gep4 - ; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 - ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) - ; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] ; GFX7-LABEL: name: atomic_cmpxchg_s64_region_gep4 ; GFX7: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -162,6 +154,14 @@ ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 ; GFX9: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + ; GFX6-LABEL: name: atomic_cmpxchg_s64_region_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s64) = COPY $vgpr1_vgpr2 %2:vgpr(s64) = COPY $vgpr3_vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir @@ -14,6 +14,12 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-LABEL: name: global_atomicrmw_add_s32 + ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX6-LABEL: name: global_atomicrmw_add_s32 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -25,12 +31,6 @@ ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] - ; GFX7-LABEL: name: global_atomicrmw_add_s32 - ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) - ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX9-LABEL: name: global_atomicrmw_add_s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -60,6 +60,11 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-LABEL: name: global_atomicrmw_add_s32_nortn + ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX6-LABEL: name: global_atomicrmw_add_s32_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -70,11 +75,6 @@ ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) - ; GFX7-LABEL: name: global_atomicrmw_add_s32_nortn - ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX9-LABEL: name: global_atomicrmw_add_s32_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -101,17 +101,6 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2047 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) - ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2047 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -128,6 +117,17 @@ ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2047 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset2047 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -159,16 +159,6 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -184,6 +174,16 @@ ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -212,17 +212,6 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2048 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) - ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2048 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -239,6 +228,17 @@ ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2048 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset2048 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -280,16 +280,6 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -305,6 +295,16 @@ ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -343,17 +343,6 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4095 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) - ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4095 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -370,6 +359,17 @@ ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4095 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset4095 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -411,16 +411,6 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -436,6 +426,16 @@ ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -474,18 +474,6 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4097 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4097 - ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) - ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4097 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -502,6 +490,18 @@ ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4097 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4097 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset4097 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -553,17 +553,6 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4097 - ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -579,6 +568,17 @@ ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4097 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -627,6 +627,12 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX7-LABEL: name: global_atomicrmw_add_s64 + ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) + ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] ; GFX6-LABEL: name: global_atomicrmw_add_s64 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -638,12 +644,6 @@ ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN]] - ; GFX7-LABEL: name: global_atomicrmw_add_s64 - ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) - ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] ; GFX9-LABEL: name: global_atomicrmw_add_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -673,6 +673,11 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX7-LABEL: name: global_atomicrmw_add_s64_nortn + ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) ; GFX6-LABEL: name: global_atomicrmw_add_s64_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -683,11 +688,6 @@ ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) - ; GFX7-LABEL: name: global_atomicrmw_add_s64_nortn - ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) ; GFX9-LABEL: name: global_atomicrmw_add_s64_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -714,17 +714,6 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX6-LABEL: name: global_atomicrmw_add_s64_offset4095 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) - ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN]] ; GFX7-LABEL: name: global_atomicrmw_add_s64_offset4095 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -741,6 +730,17 @@ ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] + ; GFX6-LABEL: name: global_atomicrmw_add_s64_offset4095 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN]] ; GFX9-LABEL: name: global_atomicrmw_add_s64_offset4095 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -782,16 +782,6 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX6-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX7-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -807,6 +797,16 @@ ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) + ; GFX6-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX9-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-local.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-local.mir @@ -16,13 +16,6 @@ bb.0: liveins: $vgpr0, $vgpr1 - ; GFX8-LABEL: name: atomicrmw_fadd_s32_local - ; GFX8: liveins: $vgpr0, $vgpr1 - ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX8: $m0 = S_MOV_B32 -1 - ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) - ; GFX8: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] ; GFX9-LABEL: name: atomicrmw_fadd_s32_local ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -36,6 +29,13 @@ ; GFX6: $m0 = S_MOV_B32 -1 ; GFX6: [[ATOMICRMW_FADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_FADD [[COPY]](p3), [[COPY1]] :: (load store seq_cst 4, addrspace 3) ; GFX6: $vgpr0 = COPY [[ATOMICRMW_FADD]](s32) + ; GFX8-LABEL: name: atomicrmw_fadd_s32_local + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: $m0 = S_MOV_B32 -1 + ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) + ; GFX8: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_ATOMICRMW_FADD %0(p3), %1 :: (load store seq_cst 4, addrspace 3) @@ -52,12 +52,6 @@ bb.0: liveins: $vgpr0, $vgpr1 - ; GFX8-LABEL: name: atomicrmw_fadd_s32_local_noret - ; GFX8: liveins: $vgpr0, $vgpr1 - ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX8: $m0 = S_MOV_B32 -1 - ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) ; GFX9-LABEL: name: atomicrmw_fadd_s32_local_noret ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -69,6 +63,12 @@ ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GFX6: $m0 = S_MOV_B32 -1 ; GFX6: [[ATOMICRMW_FADD:%[0-9]+]]:vgpr(s32) = G_ATOMICRMW_FADD [[COPY]](p3), [[COPY1]] :: (load store seq_cst 4, addrspace 3) + ; GFX8-LABEL: name: atomicrmw_fadd_s32_local_noret + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: $m0 = S_MOV_B32 -1 + ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_ATOMICRMW_FADD %0(p3), %1 :: (load store seq_cst 4, addrspace 3) @@ -84,13 +84,6 @@ bb.0: liveins: $vgpr0, $vgpr1 - ; GFX8-LABEL: name: atomicrmw_fadd_s32_local_gep4 - ; GFX8: liveins: $vgpr0, $vgpr1 - ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX8: $m0 = S_MOV_B32 -1 - ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 4, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) - ; GFX8: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] ; GFX9-LABEL: name: atomicrmw_fadd_s32_local_gep4 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -106,6 +99,13 @@ ; GFX6: $m0 = S_MOV_B32 -1 ; GFX6: [[ATOMICRMW_FADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_FADD [[PTR_ADD]](p3), [[COPY1]] :: (load store seq_cst 4, addrspace 3) ; GFX6: $vgpr0 = COPY [[ATOMICRMW_FADD]](s32) + ; GFX8-LABEL: name: atomicrmw_fadd_s32_local_gep4 + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: $m0 = S_MOV_B32 -1 + ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 4, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) + ; GFX8: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_CONSTANT i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-region.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-region.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-region.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-region.mir @@ -16,13 +16,6 @@ bb.0: liveins: $vgpr0, $vgpr1 - ; GFX8-LABEL: name: atomicrmw_fadd_s32_region - ; GFX8: liveins: $vgpr0, $vgpr1 - ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX8: $m0 = S_MOV_B32 -1 - ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) - ; GFX8: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] ; GFX9-LABEL: name: atomicrmw_fadd_s32_region ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -36,6 +29,13 @@ ; GFX6: $m0 = S_MOV_B32 -1 ; GFX6: [[ATOMICRMW_FADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_FADD [[COPY]](p2), [[COPY1]] :: (load store seq_cst 4, addrspace 2) ; GFX6: $vgpr0 = COPY [[ATOMICRMW_FADD]](s32) + ; GFX8-LABEL: name: atomicrmw_fadd_s32_region + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: $m0 = S_MOV_B32 -1 + ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX8: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_ATOMICRMW_FADD %0(p2), %1 :: (load store seq_cst 4, addrspace 2) @@ -52,12 +52,6 @@ bb.0: liveins: $vgpr0, $vgpr1 - ; GFX8-LABEL: name: atomicrmw_fadd_s32_region_noret - ; GFX8: liveins: $vgpr0, $vgpr1 - ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX8: $m0 = S_MOV_B32 -1 - ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) ; GFX9-LABEL: name: atomicrmw_fadd_s32_region_noret ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -69,6 +63,12 @@ ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GFX6: $m0 = S_MOV_B32 -1 ; GFX6: [[ATOMICRMW_FADD:%[0-9]+]]:vgpr(s32) = G_ATOMICRMW_FADD [[COPY]](p2), [[COPY1]] :: (load store seq_cst 4, addrspace 2) + ; GFX8-LABEL: name: atomicrmw_fadd_s32_region_noret + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: $m0 = S_MOV_B32 -1 + ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_ATOMICRMW_FADD %0(p2), %1 :: (load store seq_cst 4, addrspace 2) @@ -84,13 +84,6 @@ bb.0: liveins: $vgpr0, $vgpr1 - ; GFX8-LABEL: name: atomicrmw_fadd_s32_region_gep4 - ; GFX8: liveins: $vgpr0, $vgpr1 - ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX8: $m0 = S_MOV_B32 -1 - ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) - ; GFX8: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] ; GFX9-LABEL: name: atomicrmw_fadd_s32_region_gep4 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -106,6 +99,13 @@ ; GFX6: $m0 = S_MOV_B32 -1 ; GFX6: [[ATOMICRMW_FADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_FADD [[PTR_ADD]](p2), [[COPY1]] :: (load store seq_cst 4, addrspace 2) ; GFX6: $vgpr0 = COPY [[ATOMICRMW_FADD]](s32) + ; GFX8-LABEL: name: atomicrmw_fadd_s32_region_gep4 + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: $m0 = S_MOV_B32 -1 + ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX8: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_CONSTANT i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-local.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-local.mir @@ -15,13 +15,6 @@ bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: atomicrmw_xchg_s32_local - ; GFX6: liveins: $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] ; GFX7-LABEL: name: atomicrmw_xchg_s32_local ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -35,6 +28,13 @@ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: [[DS_WRXCHG_RTN_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_gfx9_]] + ; GFX6-LABEL: name: atomicrmw_xchg_s32_local + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_ATOMICRMW_XCHG %0(p3), %1 :: (load store seq_cst 4, addrspace 3) @@ -51,15 +51,6 @@ bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: atomicrmw_xchg_s32_local_gep4 - ; GFX6: liveins: $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GFX6: %3:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 %3, [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] ; GFX7-LABEL: name: atomicrmw_xchg_s32_local_gep4 ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -73,6 +64,15 @@ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: [[DS_WRXCHG_RTN_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32_gfx9 [[COPY]], [[COPY1]], 4, 0, implicit $exec :: (load store seq_cst 4, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_gfx9_]] + ; GFX6-LABEL: name: atomicrmw_xchg_s32_local_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec + ; GFX6: %3:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 %3, [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_CONSTANT i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-region.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-region.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-region.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-region.mir @@ -15,13 +15,6 @@ bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: atomicrmw_xchg_s32_region - ; GFX6: liveins: $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) - ; GFX6: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] ; GFX7-LABEL: name: atomicrmw_xchg_s32_region ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -35,6 +28,13 @@ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) ; GFX9: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + ; GFX6-LABEL: name: atomicrmw_xchg_s32_region + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_ATOMICRMW_XCHG %0(p2), %1 :: (load store seq_cst 4, addrspace 2) @@ -51,15 +51,6 @@ bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: atomicrmw_xchg_s32_region_gep4 - ; GFX6: liveins: $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GFX6: %3:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 %3, [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) - ; GFX6: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] ; GFX7-LABEL: name: atomicrmw_xchg_s32_region_gep4 ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -73,6 +64,15 @@ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) ; GFX9: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + ; GFX6-LABEL: name: atomicrmw_xchg_s32_region_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec + ; GFX6: %3:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 %3, [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_CONSTANT i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.mir @@ -381,6 +381,12 @@ body: | bb.0: liveins: $sgpr0, $sgpr1 + ; GCN-LABEL: name: trunc_sgpr_s32_to_s1_use + ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GCN: $scc = COPY [[COPY]] + ; GCN: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY]], [[COPY1]], implicit $scc + ; GCN: S_ENDPGM 0, implicit [[S_CSELECT_B32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 %2:sgpr(s1) = G_TRUNC %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir @@ -12,15 +12,6 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; GFX6-LABEL: name: trunc_sgpr_v2s32_to_v2s16 - ; GFX6: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX6: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def $scc - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX6: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc - ; GFX6: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LSHL_B32_]], [[S_AND_B32_]], implicit-def $scc - ; GFX6: S_ENDPGM 0, implicit [[S_OR_B32_]] ; GFX8-LABEL: name: trunc_sgpr_v2s32_to_v2s16 ; GFX8: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 @@ -30,6 +21,15 @@ ; GFX8: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; GFX8: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LSHL_B32_]], [[S_AND_B32_]], implicit-def $scc ; GFX8: S_ENDPGM 0, implicit [[S_OR_B32_]] + ; GFX6-LABEL: name: trunc_sgpr_v2s32_to_v2s16 + ; GFX6: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX6: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def $scc + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX6: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc + ; GFX6: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LSHL_B32_]], [[S_AND_B32_]], implicit-def $scc + ; GFX6: S_ENDPGM 0, implicit [[S_OR_B32_]] %0:sgpr(<2 x s32>) = COPY $sgpr0_sgpr1 %1:sgpr(<2 x s16>) = G_TRUNC %0 S_ENDPGM 0, implicit %1 @@ -44,6 +44,12 @@ body: | bb.0: liveins: $vgpr0_vgpr1 + ; GFX8-LABEL: name: trunc_vgpr_v2s32_to_v2s16 + ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[COPY2]], 0, 5, 2, 4, implicit $exec, implicit [[COPY1]](tied-def 0) + ; GFX8: S_ENDPGM 0, implicit [[V_MOV_B32_sdwa]] ; GFX6-LABEL: name: trunc_vgpr_v2s32_to_v2s16 ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 @@ -53,12 +59,6 @@ ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec ; GFX6: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_LSHLREV_B32_e64_]], [[V_AND_B32_e64_]], implicit $exec ; GFX6: S_ENDPGM 0, implicit [[V_OR_B32_e64_]] - ; GFX8-LABEL: name: trunc_vgpr_v2s32_to_v2s16 - ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[COPY2]], 0, 5, 2, 4, implicit $exec, implicit [[COPY1]](tied-def 0) - ; GFX8: S_ENDPGM 0, implicit [[V_MOV_B32_sdwa]] %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = G_TRUNC %0 S_ENDPGM 0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll @@ -9,7 +9,7 @@ ; CHECK: liveins: $sgpr0 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), 12, [[COPY]](s32) - ; CHECK: S_ENDPGM + ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.s.sendmsg(i32 12, i32 %m0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll @@ -326,7 +326,7 @@ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[USHLSAT:%[0-9]+]]:_(s16) = G_USHLSAT [[TRUNC]], [[TRUNC1]] + ; CHECK: [[USHLSAT:%[0-9]+]]:_(s16) = G_USHLSAT [[TRUNC]], [[TRUNC1]](s16) ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[USHLSAT]](s16) ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] @@ -343,7 +343,7 @@ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[USHLSAT:%[0-9]+]]:_(s32) = G_USHLSAT [[COPY]], [[COPY1]] + ; CHECK: [[USHLSAT:%[0-9]+]]:_(s32) = G_USHLSAT [[COPY]], [[COPY1]](s32) ; CHECK: $vgpr0 = COPY [[USHLSAT]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 @@ -363,7 +363,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[USHLSAT:%[0-9]+]]:_(s64) = G_USHLSAT [[MV]], [[MV1]] + ; CHECK: [[USHLSAT:%[0-9]+]]:_(s64) = G_USHLSAT [[MV]], [[MV1]](s64) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USHLSAT]](s64) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) @@ -385,7 +385,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[USHLSAT:%[0-9]+]]:_(<2 x s32>) = G_USHLSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; CHECK: [[USHLSAT:%[0-9]+]]:_(<2 x s32>) = G_USHLSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s32>) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USHLSAT]](<2 x s32>) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) @@ -405,7 +405,7 @@ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[SSHLSAT:%[0-9]+]]:_(s16) = G_SSHLSAT [[TRUNC]], [[TRUNC1]] + ; CHECK: [[SSHLSAT:%[0-9]+]]:_(s16) = G_SSHLSAT [[TRUNC]], [[TRUNC1]](s16) ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SSHLSAT]](s16) ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] @@ -422,7 +422,7 @@ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[SSHLSAT:%[0-9]+]]:_(s32) = G_SSHLSAT [[COPY]], [[COPY1]] + ; CHECK: [[SSHLSAT:%[0-9]+]]:_(s32) = G_SSHLSAT [[COPY]], [[COPY1]](s32) ; CHECK: $vgpr0 = COPY [[SSHLSAT]](s32) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 @@ -442,7 +442,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[SSHLSAT:%[0-9]+]]:_(s64) = G_SSHLSAT [[MV]], [[MV1]] + ; CHECK: [[SSHLSAT:%[0-9]+]]:_(s64) = G_SSHLSAT [[MV]], [[MV1]](s64) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSHLSAT]](s64) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) @@ -464,7 +464,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[SSHLSAT:%[0-9]+]]:_(<2 x s32>) = G_SSHLSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; CHECK: [[SSHLSAT:%[0-9]+]]:_(<2 x s32>) = G_SSHLSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s32>) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSHLSAT]](<2 x s32>) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s32.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s32.mir @@ -16,6 +16,13 @@ bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX103-LABEL: name: test_fmad_s32_flush + ; GFX103: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX103: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX103: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] + ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]] + ; GFX103: $vgpr0 = COPY [[FADD]](s32) ; GFX6-LABEL: name: test_fmad_s32_flush ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -34,13 +41,6 @@ ; GFX101: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX101: [[FMAD:%[0-9]+]]:_(s32) = G_FMAD [[COPY]], [[COPY1]], [[COPY2]] ; GFX101: $vgpr0 = COPY [[FMAD]](s32) - ; GFX103-LABEL: name: test_fmad_s32_flush - ; GFX103: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX103: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX103: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] - ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]] - ; GFX103: $vgpr0 = COPY [[FADD]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -59,6 +59,13 @@ bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX103-LABEL: name: test_fmad_s32_flags_flush + ; GFX103: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX103: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX103: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[COPY1]] + ; GFX103: [[FADD:%[0-9]+]]:_(s32) = nnan G_FADD [[FMUL]], [[COPY2]] + ; GFX103: $vgpr0 = COPY [[FADD]](s32) ; GFX6-LABEL: name: test_fmad_s32_flags_flush ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -77,13 +84,6 @@ ; GFX101: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX101: [[FMAD:%[0-9]+]]:_(s32) = nnan G_FMAD [[COPY]], [[COPY1]], [[COPY2]] ; GFX101: $vgpr0 = COPY [[FMAD]](s32) - ; GFX103-LABEL: name: test_fmad_s32_flags_flush - ; GFX103: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX103: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX103: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[COPY1]] - ; GFX103: [[FADD:%[0-9]+]]:_(s32) = nnan G_FADD [[FMUL]], [[COPY2]] - ; GFX103: $vgpr0 = COPY [[FADD]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -102,6 +102,19 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 + ; GFX103-LABEL: name: test_fmad_v2s32_flush + ; GFX103: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX103: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GFX103: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5 + ; GFX103: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX103: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; GFX103: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) + ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV2]] + ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV4]] + ; GFX103: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV3]] + ; GFX103: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV5]] + ; GFX103: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32) + ; GFX103: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX6-LABEL: name: test_fmad_v2s32_flush ; GFX6: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GFX6: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 @@ -135,19 +148,6 @@ ; GFX101: [[FMAD1:%[0-9]+]]:_(s32) = G_FMAD [[UV1]], [[UV3]], [[UV5]] ; GFX101: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMAD]](s32), [[FMAD1]](s32) ; GFX101: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; GFX103-LABEL: name: test_fmad_v2s32_flush - ; GFX103: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX103: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 - ; GFX103: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5 - ; GFX103: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX103: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX103: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV2]] - ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV4]] - ; GFX103: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV3]] - ; GFX103: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV5]] - ; GFX103: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32) - ; GFX103: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<2 x s32>) = COPY $vgpr4_vgpr5 @@ -166,6 +166,21 @@ bb.0: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5, $vgpr6_vgpr7_vgpr8 + ; GFX103-LABEL: name: test_fmad_v3s32_flush + ; GFX103: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX103: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX103: [[COPY2:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr6_vgpr7_vgpr8 + ; GFX103: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; GFX103: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) + ; GFX103: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<3 x s32>) + ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV3]] + ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV6]] + ; GFX103: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV4]] + ; GFX103: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV7]] + ; GFX103: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV5]] + ; GFX103: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV8]] + ; GFX103: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32) + ; GFX103: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; GFX6-LABEL: name: test_fmad_v3s32_flush ; GFX6: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX6: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 @@ -202,21 +217,6 @@ ; GFX101: [[FMAD2:%[0-9]+]]:_(s32) = G_FMAD [[UV2]], [[UV5]], [[UV8]] ; GFX101: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMAD]](s32), [[FMAD1]](s32), [[FMAD2]](s32) ; GFX101: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) - ; GFX103-LABEL: name: test_fmad_v3s32_flush - ; GFX103: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 - ; GFX103: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 - ; GFX103: [[COPY2:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr6_vgpr7_vgpr8 - ; GFX103: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; GFX103: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) - ; GFX103: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<3 x s32>) - ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV3]] - ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV6]] - ; GFX103: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV4]] - ; GFX103: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV7]] - ; GFX103: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV5]] - ; GFX103: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV8]] - ; GFX103: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32) - ; GFX103: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 %2:_(<3 x s32>) = COPY $vgpr6_vgpr7_vgpr8 @@ -235,6 +235,23 @@ bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX103-LABEL: name: test_fmad_v4s32_flush + ; GFX103: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX103: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX103: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX103: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; GFX103: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) + ; GFX103: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<4 x s32>) + ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV4]] + ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV8]] + ; GFX103: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV5]] + ; GFX103: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV9]] + ; GFX103: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV6]] + ; GFX103: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV10]] + ; GFX103: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UV3]], [[UV7]] + ; GFX103: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UV11]] + ; GFX103: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32), [[FADD3]](s32) + ; GFX103: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; GFX6-LABEL: name: test_fmad_v4s32_flush ; GFX6: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX6: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 @@ -274,23 +291,6 @@ ; GFX101: [[FMAD3:%[0-9]+]]:_(s32) = G_FMAD [[UV3]], [[UV7]], [[UV11]] ; GFX101: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FMAD]](s32), [[FMAD1]](s32), [[FMAD2]](s32), [[FMAD3]](s32) ; GFX101: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) - ; GFX103-LABEL: name: test_fmad_v4s32_flush - ; GFX103: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX103: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX103: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX103: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) - ; GFX103: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) - ; GFX103: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<4 x s32>) - ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV4]] - ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV8]] - ; GFX103: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV5]] - ; GFX103: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV9]] - ; GFX103: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV6]] - ; GFX103: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV10]] - ; GFX103: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UV3]], [[UV7]] - ; GFX103: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UV11]] - ; GFX103: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32), [[FADD3]](s32) - ; GFX103: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 %2:_(<4 x s32>) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 @@ -309,6 +309,13 @@ bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX103-LABEL: name: test_fmad_s32_denorm + ; GFX103: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX103: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX103: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] + ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]] + ; GFX103: $vgpr0 = COPY [[FADD]](s32) ; GFX6-LABEL: name: test_fmad_s32_denorm ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -330,13 +337,6 @@ ; GFX101: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] ; GFX101: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]] ; GFX101: $vgpr0 = COPY [[FADD]](s32) - ; GFX103-LABEL: name: test_fmad_s32_denorm - ; GFX103: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX103: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX103: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] - ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]] - ; GFX103: $vgpr0 = COPY [[FADD]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -355,6 +355,13 @@ bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX103-LABEL: name: test_fmad_s32_flags_denorm + ; GFX103: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX103: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX103: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[COPY1]] + ; GFX103: [[FADD:%[0-9]+]]:_(s32) = nnan G_FADD [[FMUL]], [[COPY2]] + ; GFX103: $vgpr0 = COPY [[FADD]](s32) ; GFX6-LABEL: name: test_fmad_s32_flags_denorm ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -376,13 +383,6 @@ ; GFX101: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[COPY1]] ; GFX101: [[FADD:%[0-9]+]]:_(s32) = nnan G_FADD [[FMUL]], [[COPY2]] ; GFX101: $vgpr0 = COPY [[FADD]](s32) - ; GFX103-LABEL: name: test_fmad_s32_flags_denorm - ; GFX103: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX103: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX103: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[COPY1]] - ; GFX103: [[FADD:%[0-9]+]]:_(s32) = nnan G_FADD [[FMUL]], [[COPY2]] - ; GFX103: $vgpr0 = COPY [[FADD]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -401,6 +401,19 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 + ; GFX103-LABEL: name: test_fmad_v2s32_denorm + ; GFX103: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX103: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GFX103: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5 + ; GFX103: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX103: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; GFX103: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) + ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV2]] + ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV4]] + ; GFX103: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV3]] + ; GFX103: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV5]] + ; GFX103: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32) + ; GFX103: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX6-LABEL: name: test_fmad_v2s32_denorm ; GFX6: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GFX6: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 @@ -440,19 +453,6 @@ ; GFX101: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV5]] ; GFX101: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32) ; GFX101: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; GFX103-LABEL: name: test_fmad_v2s32_denorm - ; GFX103: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX103: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 - ; GFX103: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5 - ; GFX103: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX103: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX103: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV2]] - ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV4]] - ; GFX103: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV3]] - ; GFX103: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV5]] - ; GFX103: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32) - ; GFX103: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<2 x s32>) = COPY $vgpr4_vgpr5 @@ -471,6 +471,21 @@ bb.0: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5, $vgpr6_vgpr7_vgpr8 + ; GFX103-LABEL: name: test_fmad_v3s32_denorm + ; GFX103: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX103: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX103: [[COPY2:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr6_vgpr7_vgpr8 + ; GFX103: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; GFX103: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) + ; GFX103: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<3 x s32>) + ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV3]] + ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV6]] + ; GFX103: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV4]] + ; GFX103: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV7]] + ; GFX103: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV5]] + ; GFX103: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV8]] + ; GFX103: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32) + ; GFX103: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; GFX6-LABEL: name: test_fmad_v3s32_denorm ; GFX6: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX6: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 @@ -516,21 +531,6 @@ ; GFX101: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV8]] ; GFX101: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32) ; GFX101: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) - ; GFX103-LABEL: name: test_fmad_v3s32_denorm - ; GFX103: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 - ; GFX103: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 - ; GFX103: [[COPY2:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr6_vgpr7_vgpr8 - ; GFX103: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; GFX103: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) - ; GFX103: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<3 x s32>) - ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV3]] - ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV6]] - ; GFX103: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV4]] - ; GFX103: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV7]] - ; GFX103: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV5]] - ; GFX103: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV8]] - ; GFX103: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32) - ; GFX103: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 %2:_(<3 x s32>) = COPY $vgpr6_vgpr7_vgpr8 @@ -549,6 +549,23 @@ bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX103-LABEL: name: test_fmad_v4s32_denorm + ; GFX103: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX103: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX103: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX103: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; GFX103: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) + ; GFX103: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<4 x s32>) + ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV4]] + ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV8]] + ; GFX103: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV5]] + ; GFX103: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV9]] + ; GFX103: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV6]] + ; GFX103: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV10]] + ; GFX103: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UV3]], [[UV7]] + ; GFX103: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UV11]] + ; GFX103: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32), [[FADD3]](s32) + ; GFX103: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; GFX6-LABEL: name: test_fmad_v4s32_denorm ; GFX6: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX6: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 @@ -600,23 +617,6 @@ ; GFX101: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UV11]] ; GFX101: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32), [[FADD3]](s32) ; GFX101: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) - ; GFX103-LABEL: name: test_fmad_v4s32_denorm - ; GFX103: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX103: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX103: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX103: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) - ; GFX103: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) - ; GFX103: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<4 x s32>) - ; GFX103: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[UV4]] - ; GFX103: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV8]] - ; GFX103: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[UV5]] - ; GFX103: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[UV9]] - ; GFX103: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV6]] - ; GFX103: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[UV10]] - ; GFX103: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[UV3]], [[UV7]] - ; GFX103: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[UV11]] - ; GFX103: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32), [[FADD2]](s32), [[FADD3]](s32) - ; GFX103: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<4 x s32>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 %2:_(<4 x s32>) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir @@ -23,8 +23,8 @@ ; CHECK-LABEL: name: test_merge_s32_s32_v2s32 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK: [[MV:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32) - ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](<2 x s32>) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(s32) = G_CONSTANT i32 0 %1:_(s32) = G_CONSTANT i32 1 %2:_(<2 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32) @@ -39,8 +39,8 @@ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; CHECK: [[MV:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32) - ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](<3 x s32>) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(s32) = G_CONSTANT i32 0 %1:_(s32) = G_CONSTANT i32 1 %2:_(s32) = G_CONSTANT i32 2 @@ -55,8 +55,8 @@ ; CHECK-LABEL: name: test_merge_s64_s64_s128 ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK: [[MV:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64) - ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](<2 x s64>) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(s64) = G_CONSTANT i64 0 %1:_(s64) = G_CONSTANT i64 1 %2:_(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64) @@ -72,8 +72,8 @@ ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 - ; CHECK: [[MV:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64), [[C2]](s64), [[C3]](s64) - ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[MV]](<4 x s64>) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64), [[C2]](s64), [[C3]](s64) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) %0:_(s64) = G_CONSTANT i64 0 %1:_(s64) = G_CONSTANT i64 1 %2:_(s64) = G_CONSTANT i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -406,6 +406,7 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -713,6 +714,7 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1034,6 +1036,7 @@ ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1603,6 +1606,7 @@ ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -408,6 +408,7 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -430,6 +431,7 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: global_atomic_inc v2, v[2:3], v4, off glc +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -976,6 +978,7 @@ ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -999,6 +1002,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v[2:3], v[4:5], off glc +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm @@ -1182,6 +1186,7 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1204,6 +1209,7 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: flat_atomic_inc v2, v[2:3], v4 glc +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm @@ -1457,6 +1463,7 @@ ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1480,6 +1487,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm @@ -1574,12 +1582,12 @@ ; VI-LABEL: nocse_lds_atomic_inc_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: ds_inc_rtn_u32 v4, v1, v0 ; VI-NEXT: ds_inc_rtn_u32 v5, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1594,11 +1602,11 @@ ; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: ds_inc_rtn_u32 v4, v1, v0 ; GFX9-NEXT: ds_inc_rtn_u32 v5, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -239,15 +239,17 @@ ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0xb8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_and_b32 s2, 1, s5 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX8-NEXT: s_nop 3 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_nop 2 ; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -256,39 +258,39 @@ ; GFX10_W32-LABEL: test_div_fmas_f32: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x4 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dword s7, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s4 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, s5, v0, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s7 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x4 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dword s7, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s4 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, s5, v0, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s7 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) @@ -320,6 +322,7 @@ ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70 ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x94 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0xb8 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -336,35 +339,35 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v2, 1.0, s4, v0 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v2, 1.0, s4, v0 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) @@ -396,6 +399,7 @@ ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x34 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x58 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -412,35 +416,35 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x58 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x58 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x34 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, 1.0, v0 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x58 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x58 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x34 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, 1.0, v0 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) @@ -472,6 +476,7 @@ ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0xb8 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -488,35 +493,35 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, 1.0 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, 1.0 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) @@ -547,58 +552,58 @@ ; ; GFX8-LABEL: test_div_fmas_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: s_and_b32 s2, 1, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX8-NEXT: s_and_b32 s0, 1, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_nop 3 ; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10_W32-LABEL: test_div_fmas_f64: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x1 -; GFX10_W32-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX10_W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10_W32-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s8, 1, s8 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s6 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s5 -; GFX10_W32-NEXT: v_mov_b32_e32 v3, s7 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 -; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] -; GFX10_W32-NEXT: v_mov_b32_e32 v3, s1 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s0 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s8 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s10 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s9 +; GFX10_W32-NEXT: v_mov_b32_e32 v3, s11 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4 +; GFX10_W32-NEXT: v_mov_b32_e32 v3, s5 ; GFX10_W32-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f64: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x1 -; GFX10_W64-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX10_W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10_W64-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s8, 1, s8 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s6 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5 -; GFX10_W64-NEXT: v_mov_b32_e32 v3, s7 -; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] -; GFX10_W64-NEXT: v_mov_b32_e32 v3, s1 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s0 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s8 +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s10 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s9 +; GFX10_W64-NEXT: v_mov_b32_e32 v3, s11 +; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s4 +; GFX10_W64-NEXT: v_mov_b32_e32 v3, s5 ; GFX10_W64-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10_W64-NEXT: s_endpgm %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) @@ -629,6 +634,7 @@ ; GFX8-LABEL: test_div_fmas_f32_cond_to_vcc: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cmp_eq_u32 s7, 0 @@ -649,18 +655,18 @@ ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x1 ; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; @@ -668,17 +674,17 @@ ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x1 ; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %cmp = icmp eq i32 %i, 0 @@ -725,35 +731,35 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, 0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) @@ -799,35 +805,35 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, 1 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, 1 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) @@ -877,9 +883,9 @@ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 8, v1 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_dword v1, v[1:2] -; GFX8-NEXT: flat_load_dword v2, v[3:4] -; GFX8-NEXT: flat_load_dword v3, v[5:6] +; GFX8-NEXT: flat_load_dword v7, v[1:2] +; GFX8-NEXT: flat_load_dword v8, v[3:4] +; GFX8-NEXT: flat_load_dword v9, v[5:6] ; GFX8-NEXT: s_add_u32 s0, s4, 8 ; GFX8-NEXT: s_addc_u32 s1, s5, 0 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 @@ -888,11 +894,10 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 ; GFX8-NEXT: s_and_b64 vcc, vcc, s[2:3] -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_div_fmas_f32 v2, v7, v8, v9 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -918,14 +923,14 @@ ; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo ; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10_W32-NEXT: s_clause 0x2 -; GFX10_W32-NEXT: global_load_dword v1, v[1:2], off -; GFX10_W32-NEXT: global_load_dword v2, v[3:4], off offset:-4 -; GFX10_W32-NEXT: global_load_dword v3, v[3:4], off -; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s2 -; GFX10_W32-NEXT: s_waitcnt vmcnt(0) -; GFX10_W32-NEXT: v_div_fmas_f32 v2, v1, v2, v3 +; GFX10_W32-NEXT: global_load_dword v5, v[1:2], off +; GFX10_W32-NEXT: global_load_dword v6, v[3:4], off offset:-4 +; GFX10_W32-NEXT: global_load_dword v7, v[3:4], off ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W32-NEXT: s_waitcnt vmcnt(0) +; GFX10_W32-NEXT: v_div_fmas_f32 v2, v5, v6, v7 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W32-NEXT: s_endpgm ; @@ -950,14 +955,14 @@ ; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v4, vcc, 0, v2, vcc ; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10_W64-NEXT: s_clause 0x2 -; GFX10_W64-NEXT: global_load_dword v1, v[1:2], off -; GFX10_W64-NEXT: global_load_dword v2, v[3:4], off offset:-4 -; GFX10_W64-NEXT: global_load_dword v3, v[3:4], off -; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[2:3] -; GFX10_W64-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-NEXT: v_div_fmas_f32 v2, v1, v2, v3 +; GFX10_W64-NEXT: global_load_dword v5, v[1:2], off +; GFX10_W64-NEXT: global_load_dword v6, v[3:4], off offset:-4 +; GFX10_W64-NEXT: global_load_dword v7, v[3:4], off ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 +; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[2:3] ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX10_W64-NEXT: s_waitcnt vmcnt(0) +; GFX10_W64-NEXT: v_div_fmas_f32 v2, v5, v6, v7 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off ; GFX10_W64-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1015,11 +1020,11 @@ ; ; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x4c ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] -; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 @@ -1031,16 +1036,17 @@ ; GFX8-NEXT: s_cbranch_execz BB13_2 ; GFX8-NEXT: ; %bb.1: ; %bb ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: BB13_2: ; %exit ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_add_u32 s0, s4, 8 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 -; GFX8-NEXT: s_and_b32 s2, 1, s2 +; GFX8-NEXT: s_add_u32 s0, s2, 8 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: s_and_b32 s2, 1, s4 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX8-NEXT: s_nop 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1069,6 +1075,7 @@ ; GFX10_W32-NEXT: s_cbranch_execz BB13_2 ; GFX10_W32-NEXT: ; %bb.1: ; %bb ; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74 +; GFX10_W32-NEXT: s_nop 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) @@ -1107,6 +1114,7 @@ ; GFX10_W64-NEXT: s_cbranch_execz BB13_2 ; GFX10_W64-NEXT: ; %bb.1: ; %bb ; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74 +; GFX10_W64-NEXT: s_nop 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -35,12 +35,12 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v5, v5, v4 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -58,12 +58,12 @@ ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: global_load_dword v1, v[2:3], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v1, v0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_div_scale_f32 v2, s2, v5, v5, v4 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -110,12 +110,12 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v1, v0 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v4, v5, v4 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -133,12 +133,12 @@ ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: global_load_dword v1, v[2:3], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v1, v0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_div_scale_f32 v2, s2, v4, v5, v4 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -190,12 +190,12 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[6:7], v[6:7], v[4:5] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -214,12 +214,12 @@ ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[6:7], v[6:7], v[4:5] ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -271,12 +271,12 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[4:5], v[6:7], v[4:5] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -295,12 +295,12 @@ ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[4:5], v[6:7], v[4:5] ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -338,7 +338,7 @@ ; GFX8-LABEL: test_div_scale_f32_scalar_num_1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x54 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x54 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -348,7 +348,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 +; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -406,7 +406,7 @@ ; GFX8-LABEL: test_div_scale_f32_scalar_num_2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -416,7 +416,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, v0, s0 +; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s2, v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -474,7 +474,7 @@ ; GFX8-LABEL: test_div_scale_f32_scalar_den_1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -484,7 +484,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 +; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s2, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -542,7 +542,7 @@ ; GFX8-LABEL: test_div_scale_f32_scalar_den_2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -552,7 +552,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, s0, v0 +; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -611,7 +611,7 @@ ; GFX8-LABEL: test_div_scale_f64_scalar_num_1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -623,7 +623,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[2:3] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -680,7 +680,7 @@ ; GFX8-LABEL: test_div_scale_f64_scalar_num_2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -692,7 +692,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[2:3], v[0:1], s[2:3] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -749,7 +749,7 @@ ; GFX8-LABEL: test_div_scale_f64_scalar_den_1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -761,7 +761,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[2:3], s[2:3], v[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -818,7 +818,7 @@ ; GFX8-LABEL: test_div_scale_f64_scalar_den_2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -830,7 +830,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[2:3], v[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -882,6 +882,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -894,14 +895,14 @@ ; GFX10-LABEL: test_div_scale_f32_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, s3, s3, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_div_scale_f32 v2, s0, s5, s5, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) @@ -928,6 +929,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -940,14 +942,14 @@ ; GFX10-LABEL: test_div_scale_f32_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, s2, s3, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_div_scale_f32 v2, s0, s4, s5, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) @@ -975,6 +977,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 @@ -988,14 +991,14 @@ ; GFX10-LABEL: test_div_scale_f64_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x74 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3] -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[6:7], s[6:7], s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) @@ -1023,6 +1026,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 @@ -1036,14 +1040,14 @@ ; GFX10-LABEL: test_div_scale_f64_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x74 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3] -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[4:5], s[6:7], s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) @@ -1213,12 +1217,12 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v5, v5, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1238,12 +1242,12 @@ ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: global_load_dword v1, v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v1, v0 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v5, v5, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -1295,11 +1299,11 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v5 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1319,11 +1323,11 @@ ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: global_load_dword v1, v[2:3], off +; GFX10-NEXT: global_load_dword v7, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v5 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v7 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -82,6 +82,7 @@ ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -8,10 +8,10 @@ ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 @@ -21,13 +21,13 @@ ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; encoding: [0x00,0x01,0x00,0xf4,0x2c,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; encoding: [0x02,0x02,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; encoding: [0x01,0x02,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; encoding: [0x04,0x02,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x04,0x7e,0x02,0x01,0x08,0x11] ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x00,0x02,0x7d,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll @@ -4,6 +4,21 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -19,7 +34,12 @@ ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret half %val +} + +define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 @@ -31,14 +51,9 @@ ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) - ret half %val -} - -define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; UNPACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -65,21 +80,6 @@ ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; PACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <2 x half> %val } @@ -91,6 +91,24 @@ ; } define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 + ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 + ; PACKED: $vgpr0 = COPY [[COPY7]] + ; PACKED: $vgpr1 = COPY [[COPY8]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; UNPACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -127,29 +145,27 @@ ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset + %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret <4 x half> %val +} + +define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 - ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 - ; PACKED: $vgpr0 = COPY [[COPY7]] - ; PACKED: $vgpr1 = COPY [[COPY8]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) - ret <4 x half> %val -} - -define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 @@ -166,27 +182,57 @@ ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 - ; PACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val } define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { + ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: successors: %bb.2(0x80000000) + ; PACKED: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; PACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; PACKED: bb.2: + ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; PACKED: bb.3: + ; PACKED: successors: %bb.4(0x80000000) + ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; PACKED: bb.4: + ; PACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 + ; PACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 + ; PACKED: $vgpr0 = COPY [[COPY11]] + ; PACKED: $vgpr1 = COPY [[COPY12]] + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; UNPACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: successors: %bb.2(0x80000000) @@ -251,72 +297,11 @@ ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 - ; PACKED-LABEL: name: struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: successors: %bb.2(0x80000000) - ; PACKED: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; PACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; PACKED: bb.2: - ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec - ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec - ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "TargetCustom7", align 1, addrspace 4) - ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; PACKED: bb.3: - ; PACKED: successors: %bb.4(0x80000000) - ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; PACKED: bb.4: - ; PACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 - ; PACKED: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 - ; PACKED: $vgpr0 = COPY [[COPY11]] - ; PACKED: $vgpr1 = COPY [[COPY12]] - ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %val = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret <4 x half> %val } define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { - ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 - ; UNPACKED: bb.1 (%ir-block.0): - ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; UNPACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7" + 4095, align 1, addrspace 4) - ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] - ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 ; PACKED: bb.1 (%ir-block.0): ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 @@ -332,6 +317,21 @@ ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7" + 4095, align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) ret half %val diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -950,22 +950,22 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, 63 -; GFX6-NEXT: s_bfe_u32 s1, s0, 0x20002 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; GFX6-NEXT: s_endpgm +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s0, s0, 63 +; GFX6-NEXT: s_bfe_u32 s1, s0, 0x20002 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_endpgm i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { %src = load i32, i32 addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -6,10 +6,10 @@ ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -20,15 +20,15 @@ ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -17,56 +17,56 @@ ; ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 11, v0 -; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off offset:-10 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off offset:-9 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-8 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-7 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-6 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-5 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-4 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off offset:-3 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-2 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xff -; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v3 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v11, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v3 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v5, v6, v3, v7 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v9 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v10, v3, v11 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v4 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v7 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v8, v2 -; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 11, v0 +; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v15, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[2:3], off offset:-10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[2:3], off offset:-9 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-8 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-7 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off offset:-6 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-5 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-4 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v2, s5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v13 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s4, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v15, s4, v2 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v10, v11, v0, v10 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v7, v7, v0, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v6 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v10, v9, v8 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v5, v4 +; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX7-UNALIGNED: ; %bb.0: @@ -155,30 +155,30 @@ ; ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 10, v0 -; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off offset:-8 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off offset:-6 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-4 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-2 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v4, v3, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v6, v3, v2 -; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 10, v0 +; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v9, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v8, v[2:3], off offset:-8 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v7, v[2:3], off offset:-6 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-4 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v8 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v7, v2, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v9, s4, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v5, v2, v3 +; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX7-UNALIGNED: ; %bb.0: @@ -192,36 +192,36 @@ ; ; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX7-NOUNALIGNED: ; %bb.0: -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v4 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v6 -; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v4 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2 ret <3 x i32> %load } @@ -398,101 +398,101 @@ ; ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 1 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 3 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 5 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 7 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v15, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v14, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v17, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v18, v[4:5], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v19, v[6:7], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[8:9], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[10:11], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[12:13], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[14:15], off -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s1 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[4:5], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[6:7], off -; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 8 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, s1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v18 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, s0, v19 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v16, s0, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v7 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v10, v5 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v11, v5 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v8, v5, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v0, v1, v7 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v3, v5 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v4, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v12, v5, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 1 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 3 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 5 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 7 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v15, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v14, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v23, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v22, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v21, v[4:5], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v20, v[6:7], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v19, v[8:9], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v18, v[10:11], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v17, v[12:13], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[14:15], off +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s1 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[4:5], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[6:7], off +; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, s1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v21 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v20 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v23, s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v4 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v17, v2 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v16, v2 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v19, v2, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v0, v1, v4 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v9, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v11, v2, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v8, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX7-UNALIGNED: ; %bb.0: @@ -588,52 +588,52 @@ ; ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 10 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[4:5], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[6:7], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[8:9], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[10:11], off -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v6 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v6, v0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v5, v6 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v6, v0 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 10 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v17, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v16, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v15, v[4:5], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v14, v[6:7], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v13, v[8:9], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v12, v[10:11], off +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v16 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v17, s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v14, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v15, v2, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v12, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v13, v2, v0 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX7-UNALIGNED: ; %bb.0: @@ -651,35 +651,35 @@ ; ; GFX7-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX7-NOUNALIGNED: ; %bb.0: -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:10 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 -; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:10 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2 ret <3 x i32> %load } @@ -687,10 +687,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(<3 x i32> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -709,10 +709,10 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(i96 addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_i96_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -731,10 +731,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(<3 x i32> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -753,10 +753,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(<6 x i16> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v6i16_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -96,17 +96,17 @@ ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, gv3@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, gv3@gotpcrel32@hi+4 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: global_store_dword v[0:1], v2, off -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: BB1_2: ; %Flow ; GFX9-NEXT: s_and_b32 s0, s0, 1 @@ -119,16 +119,16 @@ ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, gv1@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, gv1@gotpcrel32@hi+4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dword v[0:1], v2, off -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: BB1_4: ; %bb2 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -36,17 +36,17 @@ ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x10 -; GCN-NEXT: s_add_u32 s5, s32, 0x1000 +; GCN-NEXT: s_load_dword s8, s[4:5], 0x10 +; GCN-NEXT: s_add_u32 s4, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: s_add_u32 s5, s4, 4 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: s_add_u32 s8, s5, 4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 2 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_lshl_b32 s5, s8, 2 ; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_add_u32 s4, s4, s5 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen @@ -110,18 +110,18 @@ ; GCN-NEXT: s_cbranch_scc1 BB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_load_dword s4, s[4:5], 0xc -; GCN-NEXT: s_add_u32 s5, s32, 0x1000 -; GCN-NEXT: s_and_b32 s5, s5, 0xfffff000 -; GCN-NEXT: s_add_u32 s8, s5, 4 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xc +; GCN-NEXT: s_add_u32 s4, s32, 0x1000 +; GCN-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GCN-NEXT: s_add_u32 s5, s4, 4 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 2 -; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s5, s8, 2 ; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_add_u32 s4, s4, s5 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir @@ -28,6 +28,7 @@ bb.0: ; CHECK-LABEL: name: test_fconstant_f16_1 ; CHECK: [[C:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH3C00 + ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[C]](s16) %0:_(s16) = G_FCONSTANT half 1.0 %1:_(s32) = G_ANYEXT %0 ... diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -260,8 +260,8 @@ ; GFX9: v_pk_add_u16 ; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; VI: v_add_u16_sdwa ; VI: v_add_u16_e32 +; VI: v_add_u16_sdwa ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -360,7 +360,6 @@ ; IR-LABEL: @select_mul_lhs_const_i32( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000 ; IR-NEXT: ret i32 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %op = mul i32 1000, %select ret i32 %op @@ -380,7 +379,6 @@ ; IR-LABEL: @select_mul_rhs_const_i32( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000 ; IR-NEXT: ret i32 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %op = mul i32 %select, 1000 ret i32 %op @@ -420,7 +418,6 @@ ; IR-LABEL: @select_add_trunc_select( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50 ; IR-NEXT: ret i16 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %trunc = trunc i32 %select to i16 %op = add i16 %trunc, 42 diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs -; TRAP-HANDLER-ENABLE: NumSgprs: 61 -; TRAP-HANDLER-DISABLE: NumSgprs: 77 +; TRAP-HANDLER-ENABLE: NumSgprs: 63 +; TRAP-HANDLER-DISABLE: NumSgprs: 79 define amdgpu_kernel void @amdhsa_trap_num_sgprs( i32 addrspace(1)* %out0, i32 %in0, i32 addrspace(1)* %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -211,29 +211,29 @@ ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz BB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s1, s0, s1 +; GFX8-NEXT: s_mul_i32 s3, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 @@ -244,28 +244,28 @@ ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -277,20 +277,20 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s1, s0, s1 -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: s_mul_i32 s3, s2, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -299,9 +299,9 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -313,20 +313,20 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -335,9 +335,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 @@ -1764,29 +1764,29 @@ ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz BB9_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s1, s0, s1 +; GFX8-NEXT: s_mul_i32 s3, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB9_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 @@ -1797,28 +1797,28 @@ ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB9_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -1830,20 +1830,20 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s1, s0, s1 -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: s_mul_i32 s3, s2, s3 +; GFX1064-NEXT: v_mov_b32_e32 v2, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -1852,9 +1852,9 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -1866,20 +1866,20 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -1888,9 +1888,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -125,7 +125,7 @@ declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 declare i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #1 -attributes #0 = { nounwind "amdgpu-num-sgpr"="14" } +attributes #0 = { nounwind "amdgpu-num-sgpr"="16" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "amdgpu-num-sgpr"="12" } -attributes #3 = { nounwind "amdgpu-num-sgpr"="11" } +attributes #2 = { nounwind "amdgpu-num-sgpr"="14" } +attributes #3 = { nounwind "amdgpu-num-sgpr"="13" } diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -32,11 +32,11 @@ ; FLAT-LABEL: s_brev_i16: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dword s0, s[0:1], 0x2c +; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b32 s0, s0 +; FLAT-NEXT: s_brev_b32 s0, s2 ; FLAT-NEXT: s_lshr_b32 s0, s0, 16 ; FLAT-NEXT: v_mov_b32_e32 v0, s0 ; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -66,13 +66,13 @@ ; FLAT-LABEL: v_brev_i16: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 -; FLAT-NEXT: s_mov_b32 s2, s6 -; FLAT-NEXT: s_mov_b32 s3, s7 +; FLAT-NEXT: s_mov_b32 s10, s6 +; FLAT-NEXT: s_mov_b32 s11, s7 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; FLAT-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; FLAT-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -100,11 +100,11 @@ ; FLAT-LABEL: s_brev_i32: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dword s0, s[0:1], 0x2c +; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b32 s0, s0 +; FLAT-NEXT: s_brev_b32 s0, s2 ; FLAT-NEXT: v_mov_b32_e32 v0, s0 ; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_endpgm @@ -134,13 +134,13 @@ ; FLAT-LABEL: v_brev_i32: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dword v0, v[0:1] ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -173,14 +173,14 @@ ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b32 s1, s1 -; FLAT-NEXT: s_brev_b32 s0, s0 -; FLAT-NEXT: v_mov_b32_e32 v0, s0 -; FLAT-NEXT: v_mov_b32_e32 v1, s1 +; FLAT-NEXT: s_brev_b32 s0, s3 +; FLAT-NEXT: s_brev_b32 s1, s2 +; FLAT-NEXT: v_mov_b32_e32 v0, s1 +; FLAT-NEXT: v_mov_b32_e32 v1, s0 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 @@ -210,13 +210,13 @@ ; FLAT-LABEL: v_brev_v2i32: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -384,18 +384,18 @@ ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; FLAT-NEXT: s_mov_b32 s3, 0x33333333 +; FLAT-NEXT: s_mov_b32 s0, 0x10203 +; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f ; FLAT-NEXT: s_mov_b32 s6, 0xcccccccc ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0x10203 -; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s3, 0x33333333 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; FLAT-NEXT: v_perm_b32 v2, 0, v0, s0 @@ -518,51 +518,51 @@ ; FLAT-LABEL: s_brev_v2i64: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; FLAT-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; FLAT-NEXT: v_mov_b32_e32 v4, 0x10203 -; FLAT-NEXT: s_mov_b32 s8, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s9, 0xcccccccc -; FLAT-NEXT: s_mov_b32 s10, 0x55555555 +; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s1, 0x33333333 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v3, 0, s2, v4 -; FLAT-NEXT: v_perm_b32 v2, 0, s3, v4 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; FLAT-NEXT: v_and_b32_e32 v0, s8, v2 -; FLAT-NEXT: v_and_b32_e32 v1, s8, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s2, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s2, v3 +; FLAT-NEXT: v_perm_b32 v3, 0, s10, v4 +; FLAT-NEXT: v_perm_b32 v2, 0, s11, v4 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s0, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s0, v3 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] -; FLAT-NEXT: v_perm_b32 v7, 0, s0, v4 -; FLAT-NEXT: v_perm_b32 v6, 0, s1, v4 +; FLAT-NEXT: v_perm_b32 v7, 0, s8, v4 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: s_mov_b32 s3, 0x33333333 ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 -; FLAT-NEXT: v_and_b32_e32 v0, s3, v2 -; FLAT-NEXT: v_and_b32_e32 v1, s3, v3 -; FLAT-NEXT: v_and_b32_e32 v4, s8, v6 -; FLAT-NEXT: v_and_b32_e32 v5, s8, v7 -; FLAT-NEXT: v_and_b32_e32 v2, s9, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s9, v3 -; FLAT-NEXT: v_and_b32_e32 v6, s2, v6 -; FLAT-NEXT: v_and_b32_e32 v7, s2, v7 +; FLAT-NEXT: v_perm_b32 v6, 0, s9, v4 +; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc +; FLAT-NEXT: v_and_b32_e32 v0, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s1, v3 +; FLAT-NEXT: v_and_b32_e32 v4, s2, v6 +; FLAT-NEXT: v_and_b32_e32 v5, s2, v7 +; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v6, s0, v6 +; FLAT-NEXT: v_and_b32_e32 v7, s0, v7 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5] ; FLAT-NEXT: v_lshrrev_b64 v[6:7], 4, v[6:7] ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 ; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 +; FLAT-NEXT: s_mov_b32 s10, 0x55555555 +; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: s_mov_b32 s11, 0xaaaaaaaa ; FLAT-NEXT: v_and_b32_e32 v0, s10, v2 ; FLAT-NEXT: v_and_b32_e32 v1, s10, v3 -; FLAT-NEXT: v_and_b32_e32 v4, s3, v6 -; FLAT-NEXT: v_and_b32_e32 v5, s3, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s1, v6 +; FLAT-NEXT: v_and_b32_e32 v5, s1, v7 ; FLAT-NEXT: v_and_b32_e32 v2, s11, v2 ; FLAT-NEXT: v_and_b32_e32 v3, s11, v3 -; FLAT-NEXT: v_and_b32_e32 v6, s9, v6 -; FLAT-NEXT: v_and_b32_e32 v7, s9, v7 +; FLAT-NEXT: v_and_b32_e32 v6, s3, v6 +; FLAT-NEXT: v_and_b32_e32 v7, s3, v7 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] @@ -675,18 +675,18 @@ ; FLAT-LABEL: v_brev_v2i64: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; FLAT-NEXT: s_mov_b32 s3, 0x33333333 +; FLAT-NEXT: s_mov_b32 s0, 0x10203 +; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f ; FLAT-NEXT: s_mov_b32 s8, 0xcccccccc ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0x10203 -; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s3, 0x33333333 ; FLAT-NEXT: s_mov_b32 s9, 0x55555555 ; FLAT-NEXT: s_mov_b32 s10, 0xaaaaaaaa ; FLAT-NEXT: s_mov_b32 s7, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir b/llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir --- a/llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir +++ b/llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir @@ -1,5 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s -# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s +# RUN: llc -march=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s --- # Trivial clause at beginning of program diff --git a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir --- a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir +++ b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -1,8 +1,8 @@ # RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s # Make sure the default assumption is xnack enabled with no cpu -# RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+volcanic-islands -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s -# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s +# RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+volcanic-islands -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s +# RUN: llc -march=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s --- # Trivial clause at beginning of program name: trivial_clause_load_flat4_x1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -15,27 +15,27 @@ ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 32, align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 48, align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 64, align 1, addrspace 4) ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 80, align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 96, align 1, addrspace 4) @@ -49,13 +49,13 @@ ; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0 - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7" + 112, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) @@ -64,7 +64,7 @@ ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) @@ -73,7 +73,7 @@ ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) @@ -82,7 +82,7 @@ ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0 @@ -101,7 +101,7 @@ ; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0 - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) @@ -110,7 +110,7 @@ ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) @@ -119,7 +119,7 @@ ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 @@ -135,7 +135,7 @@ ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 @@ -150,7 +150,7 @@ ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) ; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] @@ -164,7 +164,7 @@ ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) @@ -193,7 +193,7 @@ ; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0 - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) ; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] @@ -207,7 +207,7 @@ ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -351,6 +351,7 @@ ; GFX803-NEXT: ;;#ASMEND ; GFX803-NEXT: s_mov_b32 s4, 0x40000 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload +; GFX803-NEXT: s_nop 0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; GFX803-NEXT: s_endpgm @@ -369,6 +370,7 @@ ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s6, 0x40000 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; GFX900-NEXT: s_endpgm @@ -384,6 +386,7 @@ ; GFX1010-NEXT: s_mov_b32 s6, 0x20000 ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; GFX1010-NEXT: ; implicit-def: $vcc_hi +; GFX1010-NEXT: s_nop 0 ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 @@ -391,6 +394,7 @@ ; GFX1010-NEXT: ;;#ASMSTART ; GFX1010-NEXT: ;;#ASMEND ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload +; GFX1010-NEXT: s_nop 0 ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; GFX1010-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -6,6 +6,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -27,6 +28,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -131,6 +133,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v0, v[0:1], off +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -175,6 +178,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_ushort v0, v[0:1] +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -202,9 +206,11 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: global_load_ushort v2, v[0:1], off +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:6 ; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:4 @@ -213,6 +219,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8 ; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -323,6 +330,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -345,6 +353,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir --- a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir +++ b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=tonga -run-pass post-RA-sched -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=tonga -mattr=-xnack -run-pass post-RA-sched -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # GCN: FLAT_LOAD_DWORD # GCN-NEXT: FLAT_LOAD_DWORD diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -1,7 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefixes=CHECK,GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefixes=CHECK,GCN %s ; RUN: FileCheck --enable-var-scope --check-prefixes=CHECK,DBG %s < %t ; REQUIRES: asserts +; FIXME: Verifier error with xnack enabled. + ; CHECK-LABEL: {{^}}cluster_load_cluster_store: define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -15,7 +15,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 12 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel @@ -33,7 +33,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 12 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf-broken.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf-broken.mir --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf-broken.mir +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf-broken.mir @@ -17,12 +17,14 @@ ; GXN: successors: %bb.1(0x80000000) ; GXN: liveins: $vgpr0, $sgpr0_sgpr1 ; GXN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 - ; GXN: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GXN: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $exec + ; GXN: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GXN: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $exec ; GXN: bb.1: ; GXN: successors: %bb.2(0x80000000) - ; GXN: bb.2: ; GXN: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc + ; GXN: bb.2: + ; GXN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GXN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GXN: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GXN: DS_WRITE_B32 [[DEF]], [[DEF1]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -21,13 +21,16 @@ } ; GCN-LABEL: {{^}}load_v2i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 +; SICI-DAG: s_mov_b32 s3, 0 +; SICI-DAG: s_mov_b32 s2, s1 +; SICI-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 +; VIGFX9-DAG: s_mov_b32 s6, s1 +; VIGFX9-DAG: s_mov_b32 s7, 0 +; VIGFX9-DAG: s_mov_b32 s1, s7 ; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[6:7], 0x10 define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(6)* %p1, i32 2 %r0 = load <2 x i32>, <2 x i32> addrspace(6)* %p0 @@ -105,13 +108,16 @@ } ; GCN-LABEL: {{^}}load_v2float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 +; SICI-DAG: s_mov_b32 s3, 0 +; SICI-DAG: s_mov_b32 s2, s1 +; SICI-DAG: s_mov_b32 s1, s3 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 +; VIGFX9-DAG: s_mov_b32 s6, s1 +; VIGFX9-DAG: s_mov_b32 s7, 0 +; VIGFX9-DAG: s_mov_b32 s1, s7 ; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[6:7], 0x10 define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, <2 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(6)* %p1, i32 2 %r0 = load <2 x float>, <2 x float> addrspace(6)* %p0 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -73,18 +73,18 @@ ; VI-LABEL: test_copy_v4i8_x2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 @@ -279,14 +279,14 @@ ; VI-LABEL: test_copy_v4i8_extra_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_movk_i32 s12, 0xff00 ; VI-NEXT: s_movk_i32 s13, 0xff ; VI-NEXT: s_movk_i32 s14, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -585,6 +585,7 @@ ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -35,13 +35,13 @@ ; VI-LABEL: s_ctlz_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s1, s0 -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_cselect_b32 s0, s1, 32 +; VI-NEXT: s_flbit_i32_b32 s0, s2 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cselect_b32 s0, s0, 32 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -86,13 +86,13 @@ ; VI-LABEL: v_ctlz_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -156,13 +156,13 @@ ; VI-LABEL: v_ctlz_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -238,13 +238,13 @@ ; VI-LABEL: v_ctlz_v4i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -322,13 +322,13 @@ ; VI-LABEL: v_ctlz_i8: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 @@ -398,19 +398,19 @@ ; VI-LABEL: s_ctlz_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s0 -; VI-NEXT: s_add_i32 s2, s2, 32 -; VI-NEXT: s_flbit_i32_b32 s3, s1 -; VI-NEXT: s_cmp_eq_u32 s1, 0 -; VI-NEXT: s_cselect_b32 s2, s2, s3 -; VI-NEXT: s_or_b32 s0, s0, s1 -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_cselect_b32 s0, s2, 64 +; VI-NEXT: s_flbit_i32_b32 s0, s2 +; VI-NEXT: s_add_i32 s0, s0, 32 +; VI-NEXT: s_flbit_i32_b32 s1, s3 +; VI-NEXT: s_cmp_eq_u32 s3, 0 +; VI-NEXT: s_cselect_b32 s0, s0, s1 +; VI-NEXT: s_or_b32 s1, s2, s3 +; VI-NEXT: s_cmp_lg_u32 s1, 0 +; VI-NEXT: s_cselect_b32 s0, s0, 64 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -461,18 +461,18 @@ ; VI-LABEL: s_ctlz_i64_trunc: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s0 -; VI-NEXT: s_add_i32 s2, s2, 32 -; VI-NEXT: s_flbit_i32_b32 s3, s1 -; VI-NEXT: s_cmp_eq_u32 s1, 0 -; VI-NEXT: s_cselect_b32 s2, s2, s3 -; VI-NEXT: s_or_b32 s0, s0, s1 -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_cselect_b32 s0, s2, 64 +; VI-NEXT: s_flbit_i32_b32 s0, s2 +; VI-NEXT: s_add_i32 s0, s0, 32 +; VI-NEXT: s_flbit_i32_b32 s1, s3 +; VI-NEXT: s_cmp_eq_u32 s3, 0 +; VI-NEXT: s_cselect_b32 s0, s0, s1 +; VI-NEXT: s_or_b32 s1, s2, s3 +; VI-NEXT: s_cmp_lg_u32 s1, 0 +; VI-NEXT: s_cselect_b32 s0, s0, 64 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -527,14 +527,14 @@ ; VI-LABEL: v_ctlz_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v3 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3 @@ -614,14 +614,14 @@ ; VI-LABEL: v_ctlz_i64_trunc: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] ; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 @@ -696,13 +696,13 @@ ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -763,13 +763,13 @@ ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -835,13 +835,13 @@ ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -912,13 +912,13 @@ ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -984,12 +984,12 @@ ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1053,13 +1053,13 @@ ; VI-LABEL: v_ctlz_i16_sel_eq_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 @@ -1127,12 +1127,12 @@ ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -11,8 +11,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; FUNC-LABEL: {{^}}s_ctpop_i16: -; GCN: s_load_dword [[SVAL:s[0-9]+]], -; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]] +; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], s{{[0-9]+}} ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_short [[VRESULT]], ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -401,12 +401,12 @@ ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -443,13 +443,13 @@ ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -489,13 +489,13 @@ ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -536,13 +536,13 @@ ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -595,13 +595,13 @@ ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -609,18 +609,18 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v4, v[4:5] -; VI-NEXT: flat_load_ubyte v5, v[6:7] -; VI-NEXT: flat_load_ubyte v6, v[2:3] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v9, v[4:5] +; VI-NEXT: flat_load_ubyte v10, v[6:7] +; VI-NEXT: flat_load_ubyte v8, v[2:3] +; VI-NEXT: flat_load_ubyte v11, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v9 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v10 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v8 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v11 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -677,20 +677,20 @@ ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v5, 9 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_movk_i32 s0, 0x900 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 @@ -698,7 +698,7 @@ ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 ; VI-NEXT: v_add_u16_e32 v8, 9, v4 ; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -709,7 +709,7 @@ ; VI-NEXT: v_add_u16_e32 v0, s0, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -765,13 +765,13 @@ ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -786,26 +786,26 @@ ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v8, v[8:9] -; VI-NEXT: flat_load_ubyte v9, v[10:11] -; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: flat_load_ubyte v7, v[4:5] -; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v15, v[8:9] +; VI-NEXT: flat_load_ubyte v16, v[10:11] +; VI-NEXT: flat_load_ubyte v14, v[6:7] +; VI-NEXT: flat_load_ubyte v18, v[4:5] +; VI-NEXT: flat_load_ubyte v13, v[2:3] +; VI-NEXT: flat_load_ubyte v17, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v15 ; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v16 ; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v14 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v7 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v18 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 ; VI-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v17 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -847,13 +847,13 @@ ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -898,13 +898,13 @@ ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -943,13 +943,13 @@ ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -988,12 +988,12 @@ ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1040,13 +1040,13 @@ ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -1054,18 +1054,18 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v4, v[4:5] -; VI-NEXT: flat_load_ubyte v5, v[6:7] -; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v9, v[4:5] +; VI-NEXT: flat_load_ubyte v10, v[6:7] +; VI-NEXT: flat_load_ubyte v8, v[2:3] +; VI-NEXT: flat_load_ubyte v11, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v11 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v10 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1098,13 +1098,13 @@ ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1141,13 +1141,13 @@ ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1185,13 +1185,13 @@ ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1229,13 +1229,13 @@ ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll --- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll +++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll @@ -47,12 +47,12 @@ ; GFX702: .amdgcn_target "amdgcn-amd-amdhsa--gfx702" ; GFX703: .amdgcn_target "amdgcn-amd-amdhsa--gfx703" ; GFX704: .amdgcn_target "amdgcn-amd-amdhsa--gfx704" -; GFX801: .amdgcn_target "amdgcn-amd-amdhsa--gfx801+xnack" +; GFX801: .amdgcn_target "amdgcn-amd-amdhsa--gfx801" ; GFX802: .amdgcn_target "amdgcn-amd-amdhsa--gfx802" ; GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803" -; GFX810: .amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack" +; GFX810: .amdgcn_target "amdgcn-amd-amdhsa--gfx810" ; GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" -; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902+xnack" +; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902" ; GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904" ; GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906" diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -82,18 +82,14 @@ ; GFX703: EF_AMDGPU_MACH_AMDGCN_GFX703 (0x25) ; GFX704: EF_AMDGPU_MACH_AMDGCN_GFX704 (0x26) ; GFX801: EF_AMDGPU_MACH_AMDGCN_GFX801 (0x28) -; GFX801-NEXT: EF_AMDGPU_XNACK (0x100) ; GFX802: EF_AMDGPU_MACH_AMDGCN_GFX802 (0x29) ; GFX803: EF_AMDGPU_MACH_AMDGCN_GFX803 (0x2A) ; GFX810: EF_AMDGPU_MACH_AMDGCN_GFX810 (0x2B) -; GFX810-NEXT: EF_AMDGPU_XNACK (0x100) ; GFX900: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) ; GFX902: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) -; GFX902-NEXT: EF_AMDGPU_XNACK (0x100) ; GFX904: EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E) ; GFX906: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) ; GFX908: EF_AMDGPU_MACH_AMDGCN_GFX908 (0x30) -; GFX908-NEXT: EF_AMDGPU_SRAM_ECC (0x200) ; GFX909: EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31) ; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33) ; GFX1011: EF_AMDGPU_MACH_AMDGCN_GFX1011 (0x34) diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll @@ -3,7 +3,7 @@ ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc,+xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s -; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 < %s | llvm-readobj -file-headers - | FileCheck --check-prefix=SRAM-ECC-GFX908 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 -mattr=+sram-ecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefix=SRAM-ECC-GFX908 %s ; NO-SRAM-ECC-GFX906: Flags [ ; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) @@ -23,7 +23,6 @@ ; SRAM-ECC-GFX908: Flags [ ; SRAM-ECC-GFX908: EF_AMDGPU_MACH_AMDGCN_GFX908 (0x30) ; SRAM-ECC-GFX908: EF_AMDGPU_SRAM_ECC (0x200) -; SRAM-ECC-GFX908: EF_AMDGPU_XNACK (0x100) ; SRAM-ECC-GFX908: ] define amdgpu_kernel void @elf_header() { diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -33,9 +33,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -31,9 +31,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen offset:2 +; GFX9-NEXT: buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -1,11 +1,11 @@ ; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -35,7 +35,6 @@ ; R600: -PV ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; VI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-remove-m0-redef.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-remove-m0-redef.mir --- a/llvm/test/CodeGen/AMDGPU/fold-operands-remove-m0-redef.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-remove-m0-redef.mir @@ -29,6 +29,7 @@ liveins: $vgpr0, $sgpr0 ; GCN-LABEL: name: redef_m0_same_copy + ; GCN: liveins: $vgpr0, $sgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: $m0 = COPY [[COPY1]] @@ -53,6 +54,7 @@ liveins: $vgpr0, $sgpr0 ; GCN-LABEL: name: multi_redef_m0_same_copy + ; GCN: liveins: $vgpr0, $sgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: $m0 = COPY [[COPY1]] @@ -78,6 +80,7 @@ liveins: $vgpr0, $sgpr0, $sgpr1 ; GCN-LABEL: name: redef_m0_different_copy + ; GCN: liveins: $vgpr0, $sgpr0, $sgpr1 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -105,6 +108,7 @@ liveins: $vgpr0, $sgpr0, $sgpr1 ; GCN-LABEL: name: redef_m0_mixed_copy0 + ; GCN: liveins: $vgpr0, $sgpr0, $sgpr1 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -134,6 +138,7 @@ liveins: $vgpr0, $sgpr0, $sgpr1 ; GCN-LABEL: name: redef_m0_mixed_copy1 + ; GCN: liveins: $vgpr0, $sgpr0, $sgpr1 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -163,6 +168,7 @@ liveins: $vgpr0, $sgpr0 ; GCN-LABEL: name: redef_m0_same_mov_imm + ; GCN: liveins: $vgpr0, $sgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: $m0 = S_MOV_B32 -1 @@ -187,6 +193,7 @@ liveins: $vgpr0, $sgpr0 ; GCN-LABEL: name: redef_m0_different_inst0 + ; GCN: liveins: $vgpr0, $sgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: $m0 = COPY [[COPY1]] @@ -212,6 +219,7 @@ liveins: $vgpr0, $sgpr0 ; GCN-LABEL: name: redef_m0_different_inst1 + ; GCN: liveins: $vgpr0, $sgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: $m0 = COPY [[COPY1]] @@ -237,6 +245,7 @@ liveins: $vgpr0, $sgpr0, $sgpr1 ; GCN-LABEL: name: redef_m0_mixed_read_m0 + ; GCN: liveins: $vgpr0, $sgpr0, $sgpr1 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -266,6 +275,7 @@ liveins: $vgpr0, $sgpr0 ; GCN-LABEL: name: redef_m0_same_copy_call + ; GCN: liveins: $vgpr0, $sgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: $m0 = COPY [[COPY1]] @@ -292,6 +302,7 @@ ; GCN-LABEL: name: redef_m0_same_copy_multi_block ; GCN: bb.0: ; GCN: successors: %bb.1(0x80000000) + ; GCN: liveins: $vgpr0, $sgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: $m0 = COPY [[COPY1]] @@ -323,6 +334,7 @@ liveins: $vgpr0, $sgpr0 ; GCN-LABEL: name: redef_m0_copy_self + ; GCN: liveins: $vgpr0, $sgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: $m0 = COPY [[COPY1]] @@ -348,6 +360,7 @@ liveins: $vgpr0, $sgpr0 ; GCN-LABEL: name: redef_m0_copy_physreg + ; GCN: liveins: $vgpr0, $sgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN: $m0 = COPY $sgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir @@ -62,7 +62,7 @@ ; CHECK: S_WAITCNT 0 ; CHECK: S_NOP 0, implicit-def $exec ; CHECK: $sgpr0_sgpr1 = S_MOV_B64 $exec - ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0 + ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0, implicit-def $sgpr0_sgpr1 ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr1, 1, killed $vgpr0 ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK: $sgpr1 = V_READLANE_B32_vi $vgpr0, 1 @@ -135,7 +135,7 @@ ; CHECK: liveins: $vgpr0 ; CHECK: S_WAITCNT 0 ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec - ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0 + ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0, implicit-def $sgpr0_sgpr1 ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr1, 1, killed $vgpr0 ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK: $sgpr1 = V_READLANE_B32_vi $vgpr0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -89,12 +89,12 @@ ; VI-LABEL: frem_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 8 +; VI-NEXT: s_add_u32 s0, s2, 8 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_load_ushort v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -184,12 +184,12 @@ ; VI-LABEL: fast_frem_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 8 +; VI-NEXT: s_add_u32 s0, s2, 8 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_load_ushort v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -274,12 +274,12 @@ ; VI-LABEL: unsafe_frem_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 8 +; VI-NEXT: s_add_u32 s0, s2, 8 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_load_ushort v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -378,12 +378,12 @@ ; VI-LABEL: frem_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_add_u32 s0, s2, 16 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_load_dword v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -471,12 +471,12 @@ ; VI-LABEL: fast_frem_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_add_u32 s0, s2, 16 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_load_dword v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -553,12 +553,12 @@ ; VI-LABEL: unsafe_frem_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_add_u32 s0, s2, 16 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_load_dword v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -674,31 +674,31 @@ ; VI-LABEL: frem_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[8:9], v[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3] -; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] -; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3] -; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] -; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; VI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[8:9], v[8:9], v[6:7] +; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-NEXT: v_fma_f64 v[10:11], -v[2:3], v[4:5], 1.0 +; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[4:5] +; VI-NEXT: v_fma_f64 v[10:11], -v[2:3], v[4:5], 1.0 +; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[4:5] +; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[8:9], v[6:7] +; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[4:5] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[10:11] ; VI-NEXT: s_nop 1 -; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] -; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3] -; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] +; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[12:13] +; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[8:9], v[6:7] +; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm double addrspace(1)* %in2) #0 { @@ -802,31 +802,31 @@ ; VI-LABEL: fast_frem_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[8:9], v[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3] -; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] -; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3] -; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] -; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; VI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[8:9], v[8:9], v[6:7] +; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-NEXT: v_fma_f64 v[10:11], -v[2:3], v[4:5], 1.0 +; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[4:5] +; VI-NEXT: v_fma_f64 v[10:11], -v[2:3], v[4:5], 1.0 +; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[4:5] +; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[8:9], v[6:7] +; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[4:5] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[10:11] ; VI-NEXT: s_nop 1 -; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] -; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3] -; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] +; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[12:13] +; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[8:9], v[6:7] +; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm double addrspace(1)* %in2) #0 { @@ -906,21 +906,21 @@ ; VI-LABEL: unsafe_frem_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[8:9], v[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; VI-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] -; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] +; VI-NEXT: v_rcp_f64_e32 v[2:3], v[8:9] +; VI-NEXT: v_mul_f64 v[2:3], v[6:7], v[2:3] +; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm double addrspace(1)* %in2) #1 { @@ -1065,12 +1065,12 @@ ; VI-LABEL: frem_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_add_u32 s0, s2, 16 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_load_dword v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -1324,11 +1324,11 @@ ; VI-LABEL: frem_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 32 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] @@ -1499,20 +1499,20 @@ ; VI-LABEL: frem_v2f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s2, 3 -; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 32 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: s_mov_b32 s2, 3 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_mov_b32 s3, 0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 ; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 @@ -1725,20 +1725,20 @@ ; VI-LABEL: frem_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s2, 3 -; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_add_u32 s0, s0, 64 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 64 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: s_mov_b32 s2, 3 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_mov_b32 s3, 0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 ; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 @@ -1950,17 +1950,17 @@ ; VI-LABEL: frem_v2f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_add_u32 s0, s0, 64 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 64 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -29,39 +29,39 @@ ; ; VI-LABEL: fshl_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sub_i32 s3, 32, s2 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: s_and_b32 s1, s2, 31 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_sub_i32 s0, 32, s6 +; VI-NEXT: s_and_b32 s1, s6, 31 ; VI-NEXT: s_cmp_eq_u32 s1, 0 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_alignbit_b32 v0, s4, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s3, 32, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, 31 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_sub_i32 s0, 32, s6 +; GFX9-NEXT: s_and_b32 s1, s6, 31 ; GFX9-NEXT: s_cmp_eq_u32 s1, 0 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -102,10 +102,10 @@ ; VI-LABEL: fshl_i32_imm: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_alignbit_b32 v2, s0, v0, 25 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_alignbit_b32 v2, s4, v0, 25 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -114,10 +114,10 @@ ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, 25 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, 25 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -174,22 +174,22 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_sub_i32 s8, 32, s1 -; VI-NEXT: s_and_b32 s1, s1, 31 -; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: s_cmp_eq_u32 s1, 0 -; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1 +; VI-NEXT: s_sub_i32 s0, 32, s9 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_and_b32 s0, s9, 31 +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_sub_i32 s0, 32, s8 +; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_sub_i32 s1, 32, s0 -; VI-NEXT: s_and_b32 s0, s0, 31 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_and_b32 s0, s8, 31 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -204,22 +204,22 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s8, 32, s1 -; GFX9-NEXT: s_and_b32 s1, s1, 31 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: s_cmp_eq_u32 s1, 0 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1 +; GFX9-NEXT: s_sub_i32 s0, 32, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_and_b32 s0, s9, 31 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_sub_i32 s0, 32, s8 +; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_sub_i32 s1, 32, s0 -; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_and_b32 s0, s8, 31 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -276,10 +276,10 @@ ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, 25 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -291,10 +291,10 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v2, 25 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -368,97 +368,97 @@ ; ; VI-LABEL: fshl_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: s_sub_i32 s14, 32, s3 -; VI-NEXT: s_and_b32 s3, s3, 31 -; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: s_cmp_eq_u32 s3, 0 +; VI-NEXT: s_sub_i32 s0, 32, s15 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_and_b32 s0, s15, 31 +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_alignbit_b32 v0, s7, v0, v1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_sub_i32 s3, 32, s2 -; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_sub_i32 s0, 32, s14 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; VI-NEXT: s_cmp_eq_u32 s2, 0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_and_b32 s0, s14, 31 ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_alignbit_b32 v0, s6, v0, v1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: s_sub_i32 s2, 32, s1 -; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_sub_i32 s0, 32, s13 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; VI-NEXT: s_cmp_eq_u32 s1, 0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_and_b32 s0, s13, 31 +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_sub_i32 s0, 32, s12 +; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_sub_i32 s1, 32, s0 -; VI-NEXT: s_and_b32 s0, s0, 31 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_and_b32 s0, s12, 31 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4 ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: s_sub_i32 s14, 32, s3 -; GFX9-NEXT: s_and_b32 s3, s3, 31 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: s_cmp_eq_u32 s3, 0 +; GFX9-NEXT: s_sub_i32 s0, 32, s15 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_and_b32 s0, s15, 31 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, v1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s3, 32, s2 -; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_sub_i32 s0, 32, s14 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_and_b32 s0, s14, 31 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, v1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: s_sub_i32 s2, 32, s1 -; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_sub_i32 s0, 32, s13 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX9-NEXT: s_cmp_eq_u32 s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_and_b32 s0, s13, 31 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_sub_i32 s0, 32, s12 +; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_sub_i32 s1, 32, s0 -; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: s_and_b32 s0, s12, 31 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -523,38 +523,38 @@ ; ; VI-LABEL: fshl_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_alignbit_b32 v2, s6, v1, 23 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 25 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, 31 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 23 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -33,27 +33,27 @@ ; ; VI-LABEL: fshr_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_alignbit_b32 v2, s4, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -89,10 +89,10 @@ ; VI-LABEL: fshr_i32_imm: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_alignbit_b32 v2, s0, v0, 7 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_alignbit_b32 v2, s4, v0, 7 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -101,10 +101,10 @@ ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, 7 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -157,14 +157,14 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_and_b32 s1, s1, 31 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_cmp_eq_u32 s1, 0 +; VI-NEXT: s_and_b32 s0, s9, 31 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_and_b32 s0, s0, 31 +; VI-NEXT: s_and_b32 s0, s8, 31 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc ; VI-NEXT: s_cmp_eq_u32 s0, 0 @@ -183,14 +183,14 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_and_b32 s1, s1, 31 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_cmp_eq_u32 s1, 0 +; GFX9-NEXT: s_and_b32 s0, s9, 31 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: s_and_b32 s0, s8, 31 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc ; GFX9-NEXT: s_cmp_eq_u32 s0, 0 @@ -249,10 +249,10 @@ ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -264,10 +264,10 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v2, 7 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -333,31 +333,31 @@ ; ; VI-LABEL: fshr_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: s_and_b32 s3, s3, 31 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_cmp_eq_u32 s3, 0 +; VI-NEXT: s_and_b32 s0, s15, 31 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_and_b32 s0, s14, 31 ; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc -; VI-NEXT: s_cmp_eq_u32 s2, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_and_b32 s0, s13, 31 ; VI-NEXT: v_alignbit_b32 v1, s6, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc -; VI-NEXT: s_cmp_eq_u32 s1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_and_b32 s0, s0, 31 +; VI-NEXT: s_and_b32 s0, s12, 31 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc ; VI-NEXT: s_cmp_eq_u32 s0, 0 @@ -366,38 +366,38 @@ ; VI-NEXT: v_alignbit_b32 v4, s4, v0, v4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: s_and_b32 s3, s3, 31 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_cmp_eq_u32 s3, 0 +; GFX9-NEXT: s_and_b32 s0, s15, 31 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_and_b32 s0, s14, 31 ; GFX9-NEXT: v_alignbit_b32 v1, s7, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_and_b32 s0, s13, 31 ; GFX9-NEXT: v_alignbit_b32 v1, s6, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc -; GFX9-NEXT: s_cmp_eq_u32 s1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: s_and_b32 s0, s12, 31 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc ; GFX9-NEXT: s_cmp_eq_u32 s0, 0 @@ -406,8 +406,8 @@ ; GFX9-NEXT: v_alignbit_b32 v4, s4, v0, v4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -467,38 +467,38 @@ ; ; VI-LABEL: fshr_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_alignbit_b32 v2, s6, v1, 9 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 7 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -389,6 +389,7 @@ ; FIXME: Should not scalarize ; GCN-LABEL: {{^}}v5i16_func_void: ; GFX9: buffer_load_dwordx2 v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v2 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr.ll b/llvm/test/CodeGen/AMDGPU/global-saddr.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr.ll @@ -87,6 +87,7 @@ ; GFX9-LABEL: {{^}}_amdgpu_cs_main: ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16{{$}} +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt ; GFX9-NOT: global_load_dword diff --git a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir @@ -1,6 +1,6 @@ # RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GCX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s # GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr_hidden_bundle # GCN: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir @@ -1,6 +1,6 @@ # RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s # GCN-LABEL: name: break_smem_clause_max_look_ahead_in_bundle # GCN: S_LOAD_DWORDX2_IMM diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -1,7 +1,7 @@ ; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=WAVE64 --check-prefix=NOTES %s -; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=WAVE64 --check-prefix=NOTES %s -; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=WAVE64 --check-prefix=NOTES %s -; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX1010 --check-prefix=WAVE32 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3,-xnack -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=WAVE64 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3,-xnack -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=WAVE64 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3,-xnack -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX1010 --check-prefix=WAVE32 --check-prefix=NOTES %s @var = addrspace(1) global float 0.0 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3,-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s @var = addrspace(1) global float 0.0 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -5,8 +5,8 @@ ; GFX9-LABEL: udiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_sub_i32 s3, 0, s2 @@ -65,8 +65,8 @@ ; GFX9-LABEL: urem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_sub_i32 s3, 0, s2 @@ -123,6 +123,7 @@ ; GFX9-LABEL: sdiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s3, 31 @@ -183,6 +184,7 @@ ; GFX9-LABEL: srem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s3, s2, 31 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -38,83 +38,85 @@ ; GFX8-LABEL: udot2: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -174,72 +176,72 @@ ; GFX8-LABEL: udot2_MulMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, s5, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s9, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, s2, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_MulMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v0, s5, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, s9, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v0, s2, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, s5, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MulMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v0, s5, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_add_u32_e32 v2, s9, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v0, s2, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_add_u32_e32 v2, s5, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -247,23 +249,24 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 -; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s5, s6 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s4, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s4, s5 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s8, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -317,81 +320,85 @@ ; GFX8-LABEL: idot2: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s6, s3 -; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_ashr_i32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -446,92 +453,96 @@ ; GFX8-LABEL: idot2_MixedTypedMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s6, s3 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_mad_u32_u24 v0, s3, v1, v0 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MixedTypedMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedTypedMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedTypedMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -588,83 +599,85 @@ ; GFX8-LABEL: udot2_alt_AddOperands: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_alt_AddOperands: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_alt_AddOperands: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -719,92 +732,96 @@ ; GFX8-LABEL: idot2_MixedExt: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s6, s3, 0xffff -; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, 0xffff +; GFX8-NEXT: s_ashr_i32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MixedExt: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s6, s3, 0xffff -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedExt: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s6, s3, 0xffff -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedExt: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -855,57 +872,60 @@ ; GFX8-LABEL: notudot2_SameVec: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: s_and_b32 s4, s4, 0xffff -; GFX8-NEXT: v_mad_u32_u24 v0, s2, s2, v0 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: v_mad_u32_u24 v0, s1, s1, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notudot2_SameVec: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NODL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, s4, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, s1, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_SameVec: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-DL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, s4, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, s1, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -913,19 +933,20 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, s3 -; GFX10-DL-NEXT: s_and_b32 s2, s4, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s1, s8 +; GFX10-DL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -982,83 +1003,85 @@ ; GFX8-LABEL: udot2_v4i16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_v4i16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1115,83 +1138,85 @@ ; GFX8-LABEL: udot2_v4i16_Hi: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x4 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x4 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_v4i16_Hi: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x4 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x4 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16_Hi: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x4 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1248,96 +1273,97 @@ ; GFX8-LABEL: notudot2_v4i16_Even: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s8, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s10, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s3, s8 -; GFX8-NEXT: s_and_b32 s2, s2, s8 -; GFX8-NEXT: s_and_b32 s5, s5, s8 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_and_b32 s0, s0, s10 +; GFX8-NEXT: s_and_b32 s5, s9, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX8-NEXT: s_and_b32 s4, s4, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: s_and_b32 s4, s8, s10 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notudot2_v4i16_Even: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NODL-NEXT: s_and_b32 s2, s2, s8 -; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, s10 +; GFX9-NODL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-NODL-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Even: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-DL-NEXT: s_and_b32 s2, s2, s8 -; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-DL-NEXT: s_and_b32 s0, s0, s10 +; GFX9-DL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-DL-NEXT: s_and_b32 s4, s4, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: s_and_b32 s4, s8, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Even: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s10, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s1, s1, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 -; GFX10-DL-NEXT: s_and_b32 s0, s0, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0 -; GFX10-DL-NEXT: s_and_b32 s1, s2, s7 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s10 +; GFX10-DL-NEXT: s_and_b32 s1, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s5, s9, s4 +; GFX10-DL-NEXT: s_and_b32 s0, s0, s4 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s1, v0 +; GFX10-DL-NEXT: s_and_b32 s1, s8, s4 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1394,96 +1420,97 @@ ; GFX8-LABEL: notudot2_v4i16_Middle: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s8, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s10, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s3, s8 -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s5, s5, s8 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s9, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: s_lshr_b32 s4, s8, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notudot2_v4i16_Middle: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Middle: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s10, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s11, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s10 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Middle: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s10, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s1, s1, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s10 +; GFX10-DL-NEXT: s_and_b32 s1, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s9, s4 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0 -; GFX10-DL-NEXT: s_lshr_b32 s1, s2, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s1, v0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s8, 16 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1540,96 +1567,97 @@ ; GFX8-LABEL: notudot2_DiffIndex: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX8-NEXT: s_lshr_b32 s7, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notudot2_DiffIndex: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_DiffIndex: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_DiffIndex: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-DL-NEXT: s_and_b32 s6, s1, s2 -; GFX10-DL-NEXT: s_and_b32 s0, s0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX10-DL-NEXT: s_and_b32 s6, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s0, s0, s4 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1687,100 +1715,101 @@ ; GFX8-LABEL: udot2_MultipleUses_add1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v1, s2, v1, v0 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v1, s5, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v0 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v0 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v0 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: s_and_b32 s0, s0, s6 ; GFX10-DL-NEXT: s_and_b32 s1, s1, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1838,96 +1867,100 @@ ; GFX8-LABEL: idot2_MultipleUses_add1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s6, s3 -; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX8-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_ashr_i32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v1, s5, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v0 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s1, s0, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1987,100 +2020,101 @@ ; GFX8-LABEL: udot2_MultipleUses_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v2, v0 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v2, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v2, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v2, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2139,96 +2173,100 @@ ; GFX8-LABEL: idot2_MultipleUses_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_sext_i32_i16 s6, s3 -; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX8-NEXT: v_mad_i32_i24 v0, s3, v2, v0 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_ashr_i32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_ashr_i32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, s1, v2, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v2, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v2, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-DL-NEXT: s_sext_i32_i16 s3, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i16 s5, s1 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 16 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s4, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2289,100 +2327,101 @@ ; GFX8-LABEL: udot2_MultipleUses_mul2: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_lshr_b32 s4, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s9, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: s_and_b32 s0, s0, s6 ; GFX10-DL-NEXT: s_and_b32 s1, s1, s6 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2441,96 +2480,100 @@ ; GFX8-LABEL: idot2_MultipleUses_mul2: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s6, s3 -; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_ashr_i32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s1, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_ashr_i32 s2, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s3, s1, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 +; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2589,14 +2632,14 @@ ; GFX8-LABEL: udot2_acc16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s3, s2, s0 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16 @@ -2613,14 +2656,14 @@ ; GFX9-NODL-LABEL: udot2_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 @@ -2637,17 +2680,18 @@ ; GFX9-DL-LABEL: udot2_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2661,10 +2705,10 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2722,83 +2766,83 @@ ; GFX8-LABEL: notsdot2_sext8: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: flat_load_ushort v1, v[2:3] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: flat_load_ushort v4, v[0:1] +; GFX8-NEXT: flat_load_ushort v5, v[2:3] +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0) -; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v5 +; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX8-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX8-NEXT: v_bfe_i32 v1, v5, 0, 8 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, v1, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notsdot2_sext8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NODL-NEXT: global_load_ushort v1, v[2:3], off -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off +; GFX9-NODL-NEXT: global_load_ushort v5, v[2:3], off +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v4 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 -; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v5 +; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v5, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v1, v0, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v1, v0, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notsdot2_sext8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-DL-NEXT: global_load_ushort v1, v[2:3], off -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off +; GFX9-DL-NEXT: global_load_ushort v5, v[2:3], off +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 -; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v3, 8, v5 +; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v1, v5, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v0, v1, v0, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v1, v0, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2806,29 +2850,29 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-DL-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-DL-NEXT: global_load_ushort v1, v[2:3], off -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_ushort v4, v[0:1], off +; GFX10-DL-NEXT: global_load_ushort v5, v[2:3], off +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, v0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, v5 +; GFX10-DL-NEXT: v_bfe_i32 v2, v4, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v3, v5, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, v3, v2, s2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, v1, v0, v2 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v1, v0, s0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i8> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -41,97 +41,101 @@ ; GFX8-LABEL: idot4_acc32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s4, s2 -; GFX8-NEXT: s_sext_i32_i8 s5, s3 -; GFX8-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s4, s0 +; GFX8-NEXT: s_sext_i32_i8 s5, s1 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX8-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX8-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_ashr_i32 s3, s3, 24 +; GFX8-NEXT: s_ashr_i32 s1, s1, 24 ; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -219,10 +223,10 @@ ; GFX8-LABEL: idot4_acc16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -250,10 +254,10 @@ ; GFX9-NODL-LABEL: idot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -281,17 +285,18 @@ ; GFX9-DL-LABEL: idot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -305,10 +310,10 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -386,14 +391,14 @@ ; GFX8-LABEL: idot4_acc8: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_and_b32 s3, s2, s0 @@ -418,14 +423,14 @@ ; GFX9-NODL-LABEL: idot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 @@ -450,17 +455,18 @@ ; GFX9-DL-LABEL: idot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -474,10 +480,10 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -547,126 +553,130 @@ ; GFX8-LABEL: idot4_multiuse_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s4, s2 -; GFX8-NEXT: s_sext_i32_i8 s5, s3 -; GFX8-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s4, s0 +; GFX8-NEXT: s_sext_i32_i8 s5, s1 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX8-NEXT: v_mad_i32_i24 v1, s4, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX8-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX8-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX8-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_ashr_i32 s3, s3, 24 +; GFX8-NEXT: s_ashr_i32 s1, s1, 24 ; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-DL-NEXT: s_sext_i32_i8 s5, s1 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80008 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x80010 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x80010 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_sext_i32_i8 s2, s0 -; GFX10-DL-NEXT: s_sext_i32_i8 s3, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80010 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x80010 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -746,96 +756,99 @@ ; GFX8-LABEL: idot4_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX8-NEXT: s_ashr_i32 s6, s3, 24 -; GFX8-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX8-NEXT: s_sext_i32_i8 s3, s3 -; GFX8-NEXT: s_ashr_i32 s4, s2, 24 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX8-NEXT: s_sext_i32_i8 s2, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX8-NEXT: s_ashr_i32 s6, s1, 24 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX8-NEXT: s_sext_i32_i8 s1, s1 +; GFX8-NEXT: s_ashr_i32 s4, s0, 24 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x80010 +; GFX8-NEXT: s_sext_i32_i8 s0, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mad_i32_i24 v0, s5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s6, s3, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s4, s2, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s6, s1, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s1 +; GFX9-NODL-NEXT: s_ashr_i32 s4, s0, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s0, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX9-DL-NEXT: s_ashr_i32 s6, s3, 24 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-DL-NEXT: s_ashr_i32 s4, s2, 24 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX9-DL-NEXT: s_ashr_i32 s6, s1, 24 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s1, s1 +; GFX9-DL-NEXT: s_ashr_i32 s4, s0, 24 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -843,30 +856,31 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s5, s3 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x80010 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -929,10 +943,10 @@ ; GFX8-LABEL: idot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -961,14 +975,14 @@ ; GFX9-NODL-LABEL: idot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v3, 8, s5 ; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s4 @@ -977,18 +991,18 @@ ; GFX9-NODL-NEXT: v_lshl_or_b32 v3, v3, 16, v5 ; GFX9-NODL-NEXT: v_and_b32_e32 v5, s4, v4 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v0, 8, s2 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v0, 8, s0 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s3, v4 -; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, s1, v4 +; GFX9-NODL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s2, v4 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v4 ; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u32_e32 v4, v3, v4 @@ -1001,14 +1015,14 @@ ; GFX9-DL-LABEL: idot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s5 ; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s4 @@ -1017,18 +1031,18 @@ ; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v5 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s4, v4 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s3 -; GFX9-DL-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v0, 8, s2 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s1 +; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v0, 8, s0 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s3, v4 -; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s1, v4 +; GFX9-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v4 ; GFX9-DL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v4, v3, v4 @@ -1049,28 +1063,28 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s0 -; GFX10-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v3 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s1 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v3 -; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 8, s2 +; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x80000 +; GFX10-DL-NEXT: v_and_b32_e32 v7, s1, v3 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s4 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s5 +; GFX10-DL-NEXT: v_and_b32_e32 v6, s2, v3 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 16 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x80000 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 8, s0 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x80000 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s0 +; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX10-DL-NEXT: v_and_b32_e32 v7, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s1, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v7 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s1 ; GFX10-DL-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v7 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -42,99 +42,101 @@ ; GFX8-LABEL: udot4_acc32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s5, s3, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX8-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX8-NEXT: v_mad_u32_u24 v0, s4, v0, v1 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX8-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_lshr_b32 s4, s4, 24 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX8-NEXT: s_lshr_b32 s3, s3, 24 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v0, v1 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -213,14 +215,14 @@ ; GFX8-LABEL: udot4_acc16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s3, s1, s0 ; GFX8-NEXT: s_and_b32 s0, s2, s0 @@ -245,14 +247,14 @@ ; GFX9-NODL-LABEL: udot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 ; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 @@ -277,17 +279,18 @@ ; GFX9-DL-LABEL: udot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -301,10 +304,10 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -383,14 +386,14 @@ ; GFX8-LABEL: udot4_acc8: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_and_b32 s3, s2, s0 @@ -415,14 +418,14 @@ ; GFX9-NODL-LABEL: udot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 @@ -447,17 +450,18 @@ ; GFX9-DL-LABEL: udot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -471,10 +475,10 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -538,14 +542,14 @@ ; GFX8-LABEL: udot2_8: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s3, s2, s0 ; GFX8-NEXT: s_and_b32 s0, s1, s0 @@ -562,14 +566,14 @@ ; GFX9-NODL-LABEL: udot2_8: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 ; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 @@ -586,14 +590,14 @@ ; GFX9-DL-LABEL: udot2_8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_and_b32 s3, s2, s0 ; GFX9-DL-NEXT: s_and_b32 s0, s1, s0 @@ -617,17 +621,17 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s1, 0xff +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s0, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s2, s1 -; GFX10-DL-NEXT: s_and_b32 s1, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s1, s5, s0 +; GFX10-DL-NEXT: s_and_b32 s0, s4, s0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x80008 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -687,14 +691,14 @@ ; GFX8-LABEL: udot4_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s3, s1, s0 ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 @@ -719,14 +723,14 @@ ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 ; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 @@ -751,17 +755,18 @@ ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s3, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s1, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -775,10 +780,10 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s1, s0, v2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s5, s4, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -850,14 +855,14 @@ ; GFX8-LABEL: udot4_CommutationAccrossMADs: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX8-NEXT: s_and_b32 s3, s1, s0 @@ -882,14 +887,14 @@ ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 @@ -914,14 +919,14 @@ ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX9-DL-NEXT: s_and_b32 s3, s1, s0 @@ -946,7 +951,6 @@ ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s4, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 @@ -954,21 +958,22 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: s_and_b32 s2, s0, s4 -; GFX10-DL-NEXT: s_and_b32 s3, s1, s4 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 +; GFX10-DL-NEXT: s_and_b32 s0, s4, s2 +; GFX10-DL-NEXT: s_and_b32 s1, s5, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1041,130 +1046,131 @@ ; GFX8-LABEL: udot4_multiuse_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s5, s3, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_and_b32 s5, s1, s8 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX8-NEXT: v_mad_u32_u24 v1, s5, v0, v1 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX8-NEXT: v_mad_u32_u24 v1, s4, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x80010 ; GFX8-NEXT: v_mad_u32_u24 v1, s6, v2, v1 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX8-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX8-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_lshr_b32 s4, s4, 24 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX8-NEXT: s_lshr_b32 s3, s3, 24 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v0, v1 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v0, v1 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x80010 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x80010 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s4, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1246,134 +1252,135 @@ ; GFX8-LABEL: udot4_multiuse_add1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s5, s3, s2 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX8-NEXT: s_and_b32 s5, s1, s8 ; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NEXT: v_mad_u32_u24 v0, s6, v0, v1 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX8-NEXT: s_and_b32 s4, s0, s8 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s10, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, s5, v2, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s4, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: s_lshr_b32 s4, s4, 24 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mad_u32_u24 v0, s8, v2, v0 -; GFX8-NEXT: s_lshr_b32 s3, s3, 24 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mad_u32_u24 v0, s3, v2, v0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mad_u32_u24 v0, s0, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_multiuse_add1: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s5, s1, s8 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-NODL-NEXT: v_add_u32_e32 v1, s10, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v2, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v2, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v2, v0 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s0, v2, v0 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX9-DL-NEXT: s_and_b32 s5, s1, s8 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-DL-NEXT: s_and_b32 s4, s0, s8 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x80010 ; GFX9-DL-NEXT: v_add_u32_e32 v1, s10, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v2, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v2, v0 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v2, v0 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s0, v2, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: s_movk_i32 s7, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s6, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 -; GFX10-DL-NEXT: s_and_b32 s2, s0, s7 -; GFX10-DL-NEXT: s_and_b32 s3, s1, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v0 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: s_and_b32 s4, s0, s6 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s5, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s5, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s8, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1455,10 +1462,10 @@ ; GFX8-LABEL: notdot4_mixedtypes: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1486,10 +1493,10 @@ ; GFX9-NODL-LABEL: notdot4_mixedtypes: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1517,10 +1524,10 @@ ; GFX9-DL-LABEL: notdot4_mixedtypes: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1555,21 +1562,21 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_sext_i32_i8 s2, s0 -; GFX10-DL-NEXT: s_sext_i32_i8 s3, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_sext_i32_i8 s0, s4 +; GFX10-DL-NEXT: s_sext_i32_i8 s1, s5 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1651,96 +1658,96 @@ ; GFX8-LABEL: udot4_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s5, s3, 24 -; GFX8-NEXT: s_lshr_b32 s6, s4, 24 -; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s3 -; GFX8-NEXT: s_and_b32 s3, s3, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s4 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_lshr_b32 s4, s0, 24 +; GFX8-NEXT: s_lshr_b32 s5, s1, 24 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX8-NEXT: s_and_b32 s0, s0, s8 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX8-NEXT: s_and_b32 s1, s1, s8 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v2, v3 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s8, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 24 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s4, 24 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s3 -; GFX9-NODL-NEXT: s_and_b32 s3, s3, s2 -; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s4 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s0, s8 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v2, v3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v2, v3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s8, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 24 -; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 24 -; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s3 -; GFX9-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s4 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s0 +; GFX9-DL-NEXT: s_and_b32 s0, s0, s8 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s1 +; GFX9-DL-NEXT: s_and_b32 s1, s1, s8 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v2, v3 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1748,32 +1755,33 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s6, 0xff -; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: s_and_b32 s4, s2, s6 -; GFX10-DL-NEXT: s_and_b32 s6, s3, s6 -; GFX10-DL-NEXT: v_and_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_and_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-DL-NEXT: s_and_b32 s6, s0, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s1, s5 +; GFX10-DL-NEXT: v_and_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s6, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1838,14 +1846,14 @@ ; GFX8-LABEL: udot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s6, s1, s0 ; GFX8-NEXT: s_and_b32 s0, s2, s0 @@ -1869,30 +1877,30 @@ ; GFX9-NODL-LABEL: udot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v3, s6, 16, v3 ; GFX9-NODL-NEXT: v_lshl_or_b32 v4, s4, 16, v4 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v4, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 ; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v0, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u32_e32 v4, v2, v4 @@ -1905,30 +1913,30 @@ ; GFX9-DL-LABEL: udot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 24 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-DL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v3, s6, 16, v3 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, s4, 16, v4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v4, v3 -; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-DL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s1 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX9-DL-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 ; GFX9-DL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v0, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v4, v2, v4 @@ -1949,24 +1957,24 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s0 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s1 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s4 +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v3, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s5 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s0, s5, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 16 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24 +; GFX10-DL-NEXT: s_lshr_b32 s0, s5, 24 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, s1, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v3, s0, 16, v3 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, s0, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v3, s1, 16, v3 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 @@ -2050,13 +2058,13 @@ ; GFX8-LABEL: udot4_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2090,29 +2098,30 @@ ; GFX9-NODL-LABEL: udot4_acc8_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_nop 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s0, v0 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NODL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 ; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NODL-NEXT: v_or_b32_e32 v3, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v5, v[0:1], off ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) @@ -2126,29 +2135,30 @@ ; GFX9-DL-LABEL: udot4_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s0, v0 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-DL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 ; GFX9-DL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v5, v[0:1], off ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -2169,18 +2179,18 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 24 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s3 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s4 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s5 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s0, s1 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s4, s5 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 16 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v5 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -59,129 +59,133 @@ ; GFX8-LABEL: idot8_acc32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s5, s3, 0x40000 -; GFX8-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 ; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX8-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX8-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX8-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: s_ashr_i32 s3, s3, 28 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 ; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 -; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX9-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mad_i32_i24 v2, s0, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -322,10 +326,10 @@ ; GFX8-LABEL: idot8_acc16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -373,10 +377,10 @@ ; GFX9-LABEL: idot8_acc16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -424,10 +428,10 @@ ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -482,41 +486,41 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3 -; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12 +; GFX10-DL-NEXT: s_bfe_i32 s2, s4, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s3, s5, 0x40000 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s8, s5, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s0, s5, 0x40004 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s7, s8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s2, v2 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s0, v2 +; GFX10-DL-NEXT: s_mov_b32 s0, 0xffff +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40010 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 -; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_ashr_i32 s0, s4, 28 +; GFX10-DL-NEXT: s_ashr_i32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -657,14 +661,14 @@ ; GFX8-LABEL: idot8_acc8: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s3, s1, 12 ; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40000 @@ -711,14 +715,14 @@ ; GFX9-LABEL: idot8_acc8: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s1, 12 ; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000 @@ -765,14 +769,14 @@ ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 12 ; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 @@ -826,41 +830,41 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3 -; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12 +; GFX10-DL-NEXT: s_bfe_i32 s2, s4, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s3, s5, 0x40000 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s8, s5, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s0, s5, 0x40004 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s7, s8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s2, v2 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s0, v2 +; GFX10-DL-NEXT: s_movk_i32 s0, 0xff +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40010 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 -; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_ashr_i32 s0, s4, 28 +; GFX10-DL-NEXT: s_ashr_i32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -987,190 +991,194 @@ ; GFX8-LABEL: idot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0x40000 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 ; GFX8-NEXT: v_mad_i32_i24 v1, s4, v0, v1 -; GFX8-NEXT: s_bfe_i32 s7, s3, 0x40004 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NEXT: v_mad_i32_i24 v0, s6, v2, v0 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX8-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX8-NEXT: v_mad_i32_i24 v0, s8, v2, v0 -; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX8-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v2, s11 -; GFX8-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX8-NEXT: v_mad_i32_i24 v0, s10, v2, v0 -; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 -; GFX8-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX8-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX8-NEXT: v_mad_i32_i24 v0, s12, v2, v0 -; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v2, s15 -; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX8-NEXT: v_mad_i32_i24 v0, s14, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 -; GFX8-NEXT: s_ashr_i32 s3, s3, 28 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 ; GFX8-NEXT: v_mad_i32_i24 v0, s16, v2, v0 -; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mad_i32_i24 v0, s0, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: v_mad_i32_i24 v1, s4, v0, v1 -; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: v_mad_i32_i24 v0, s6, v2, v0 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX9-NEXT: v_mad_i32_i24 v0, s8, v2, v0 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v2, s11 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-NEXT: v_mad_i32_i24 v0, s10, v2, v0 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v0, s12, v2, v0 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX9-NEXT: v_mad_i32_i24 v0, s14, v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: v_mad_i32_i24 v0, s16, v2, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mad_i32_i24 v0, s0, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 -; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x4000c ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 -; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v2, v0 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s0, v2, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40000 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v0 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v0 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40004 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x4000c +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40018 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s5, v1 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1295,31 +1303,32 @@ ; GFX8-LABEL: idot8_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s4, s2, 28 -; GFX8-NEXT: s_ashr_i32 s11, s3, 28 -; GFX8-NEXT: s_bfe_i32 s12, s3, 0x40018 -; GFX8-NEXT: s_bfe_i32 s13, s3, 0x40014 -; GFX8-NEXT: s_bfe_i32 s14, s3, 0x40010 -; GFX8-NEXT: s_bfe_i32 s15, s3, 0x4000c -; GFX8-NEXT: s_bfe_i32 s16, s3, 0x40008 -; GFX8-NEXT: s_bfe_i32 s17, s3, 0x40004 -; GFX8-NEXT: s_bfe_i32 s3, s3, 0x40000 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40018 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40014 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x4000c -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX8-NEXT: s_bfe_i32 s10, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s2, s2, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: s_ashr_i32 s4, s0, 28 +; GFX8-NEXT: s_ashr_i32 s11, s1, 28 +; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40018 +; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40010 +; GFX8-NEXT: s_bfe_i32 s15, s1, 0x4000c +; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40008 +; GFX8-NEXT: s_bfe_i32 s17, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x40000 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40018 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40014 +; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40010 +; GFX8-NEXT: s_bfe_i32 s8, s0, 0x4000c +; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40004 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x40000 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: v_mad_i32_i24 v0, s2, v0, v1 +; GFX8-NEXT: v_mad_i32_i24 v0, s0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 @@ -1334,39 +1343,40 @@ ; GFX8-NEXT: v_mad_i32_i24 v0, s5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 28 -; GFX9-NEXT: s_ashr_i32 s11, s3, 28 -; GFX9-NEXT: s_bfe_i32 s12, s3, 0x40018 -; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s14, s3, 0x40010 -; GFX9-NEXT: s_bfe_i32 s15, s3, 0x4000c -; GFX9-NEXT: s_bfe_i32 s16, s3, 0x40008 -; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s3, s3, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s2, s2, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_ashr_i32 s4, s0, 28 +; GFX9-NEXT: s_ashr_i32 s11, s1, 28 +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40010 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x4000c +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40008 +; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s1, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40014 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40010 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x4000c +; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_i32_i24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_i32_i24 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 @@ -1381,43 +1391,45 @@ ; GFX9-NEXT: v_mad_i32_i24 v0, s5, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1518,10 +1530,10 @@ ; GFX8-LABEL: idot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1566,26 +1578,28 @@ ; GFX9-LABEL: idot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_and_b32 s11, s2, 15 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s8 -; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_and_b32 s11, s0, 15 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s9, s10 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s5, s8 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s4 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 @@ -1594,27 +1608,27 @@ ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-NEXT: s_and_b32 s17, s6, 15 ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s17, s6 -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s15, s16 -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s13, s14 +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s17, s6 +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s15, s16 +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s13, s14 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v1, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 ; GFX9-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s12 -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s7, s12 +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 @@ -1633,26 +1647,28 @@ ; GFX9-DL-LABEL: idot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s8 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s11, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s10 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s8 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s1, s4 ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 @@ -1661,27 +1677,27 @@ ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s17, s6 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s15, s16 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s13, s14 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s17, s6 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s15, s16 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s13, s14 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v1, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v0, v4 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6 ; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s12 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s7, s12 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7 @@ -1707,52 +1723,52 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 28 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX10-DL-NEXT: s_and_b32 s8, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s9, s1, 15 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40004 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s8 -; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x4000c -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 28 +; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s3, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s8, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s4, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s9, s5, 15 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s8, s4 +; GFX10-DL-NEXT: s_bfe_u32 s8, s5, 0x40004 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s4 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s8 +; GFX10-DL-NEXT: s_bfe_u32 s9, s5, 0x4000c +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s4 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s4, s5, 0x40008 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s9 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s6, s5, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s5, 0x40014 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s6, s4 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s3 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s2, s3 +; GFX10-DL-NEXT: s_bfe_u32 s8, s5, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s2, s5, 28 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s8, s2 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] @@ -1884,14 +1900,14 @@ ; GFX8-LABEL: idot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40004 ; GFX8-NEXT: s_bfe_i32 s9, s1, 0x4000c @@ -1954,14 +1970,14 @@ ; GFX9-LABEL: idot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s1, 4 ; GFX9-NEXT: s_lshr_b32 s14, s2, 4 @@ -2042,14 +2058,14 @@ ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 4 ; GFX9-DL-NEXT: s_lshr_b32 s14, s2, 4 @@ -2137,24 +2153,24 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 4 -; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 4 +; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 4 +; GFX10-DL-NEXT: s_lshr_b32 s14, s5, 4 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s14 -; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX10-DL-NEXT: s_lshr_b32 s8, s4, 12 +; GFX10-DL-NEXT: s_lshr_b32 s15, s5, 12 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13 -; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 8 -; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 8 +; GFX10-DL-NEXT: s_lshr_b32 s9, s4, 8 +; GFX10-DL-NEXT: s_lshr_b32 s16, s5, 8 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 @@ -2167,31 +2183,31 @@ ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v19, v14 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 20 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s10, s1, 20 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 20 +; GFX10-DL-NEXT: s_lshr_b32 s2, s4, 16 +; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 24 +; GFX10-DL-NEXT: s_lshr_b32 s10, s5, 20 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s1 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s10 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v12 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 -; GFX10-DL-NEXT: s_lshr_b32 s11, s1, 16 -; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 28 +; GFX10-DL-NEXT: s_lshr_b32 s11, s5, 16 +; GFX10-DL-NEXT: s_lshr_b32 s12, s5, 28 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s11 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v11 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v13 -; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 24 +; GFX10-DL-NEXT: s_lshr_b32 s13, s5, 24 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v16 @@ -2210,7 +2226,7 @@ ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -57,11 +57,13 @@ ; GFX8-LABEL: udot8_acc32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 ; GFX8-NEXT: s_bfe_u32 s12, s6, 0x40018 @@ -71,17 +73,17 @@ ; GFX8-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX8-NEXT: s_and_b32 s6, s6, 15 -; GFX8-NEXT: s_lshr_b32 s3, s2, 28 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX8-NEXT: s_and_b32 s2, s2, 15 +; GFX8-NEXT: s_lshr_b32 s1, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 @@ -95,20 +97,22 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s12 ; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 @@ -118,17 +122,17 @@ ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 @@ -142,44 +146,46 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -303,10 +309,10 @@ ; GFX8-LABEL: udot8_acc16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -351,10 +357,10 @@ ; GFX9-LABEL: udot8_acc16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -399,10 +405,10 @@ ; GFX9-DL-LABEL: udot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -454,34 +460,34 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40008 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -606,10 +612,10 @@ ; GFX8-LABEL: udot8_acc8: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -654,10 +660,10 @@ ; GFX9-LABEL: udot8_acc8: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -702,10 +708,10 @@ ; GFX9-DL-LABEL: udot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -757,34 +763,34 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40008 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -910,10 +916,10 @@ ; GFX8-LABEL: udot8_acc4: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -961,10 +967,10 @@ ; GFX9-LABEL: udot8_acc4: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1012,10 +1018,10 @@ ; GFX9-DL-LABEL: udot8_acc4: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1070,36 +1076,36 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s5 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x4000c +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s1, s3 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s2, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1210,10 +1216,10 @@ ; GFX8-LABEL: udot8_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1261,10 +1267,10 @@ ; GFX9-LABEL: udot8_CommutationInsideMAD: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1312,10 +1318,10 @@ ; GFX9-DL-LABEL: udot8_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -1370,36 +1376,36 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40008 +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x4000c +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s0, s1 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s2, s3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1509,11 +1515,13 @@ ; GFX8-LABEL: udot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 @@ -1523,19 +1531,19 @@ ; GFX8-NEXT: s_bfe_u32 s15, s6, 0x4000c ; GFX8-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX8-NEXT: s_and_b32 s6, s6, 15 -; GFX8-NEXT: s_lshr_b32 s3, s2, 28 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX8-NEXT: s_and_b32 s2, s2, 15 +; GFX8-NEXT: s_lshr_b32 s1, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, s0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX8-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mad_u32_u24 v1, s10, v2, v1 @@ -1548,21 +1556,23 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, s1, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 @@ -1572,19 +1582,19 @@ ; GFX9-NEXT: s_bfe_u32 s15, s6, 0x4000c ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1 @@ -1597,21 +1607,23 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s1, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-DL-NEXT: s_lshr_b32 s7, s6, 28 @@ -1621,19 +1633,19 @@ ; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x4000c ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-DL-NEXT: s_and_b32 s6, s6, 15 -; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 +; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v2, v1 @@ -1646,34 +1658,35 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, v2, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: s_and_b32 s4, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s5, s1, 15 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v0 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c @@ -1691,8 +1704,8 @@ ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1816,11 +1829,13 @@ ; GFX8-LABEL: udot8_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 ; GFX8-NEXT: s_bfe_u32 s12, s6, 0x40018 @@ -1830,17 +1845,17 @@ ; GFX8-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX8-NEXT: s_and_b32 s6, s6, 15 -; GFX8-NEXT: s_lshr_b32 s3, s2, 28 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX8-NEXT: s_and_b32 s2, s2, 15 +; GFX8-NEXT: s_lshr_b32 s1, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 @@ -1854,20 +1869,22 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s12 ; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 @@ -1877,17 +1894,17 @@ ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s3, s2, 28 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: s_lshr_b32 s1, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 @@ -1901,44 +1918,46 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s0, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2037,10 +2056,10 @@ ; GFX8-LABEL: udot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2085,45 +2104,47 @@ ; GFX9-LABEL: udot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 ; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, s3, v0 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s13, s14 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, s1, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s13, s14 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40014 ; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-NEXT: s_and_b32 s17, s6, 15 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s8 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s15, s16 ; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v0 -; GFX9-NEXT: s_and_b32 s11, s2, 15 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s11, s0, 15 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s17, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s17, s6 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, s4, v0 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v6, v5, v6 @@ -2140,45 +2161,47 @@ ; GFX9-DL-LABEL: udot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_nop 0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 ; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 ; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s3, v0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s13, s14 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s1, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s13, s14 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40014 ; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s5, s8 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x4000c ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s15, s16 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v0 -; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: s_and_b32 s11, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s10 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s17, s6 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s17, s6 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s4, v0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s0, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v6, v5, v6 @@ -2202,42 +2225,42 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40004 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s3 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s3, s4, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x40004 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x40008 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x4000c ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s2, s4 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40014 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x40014 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s3, s1 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s3, s5, 28 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s4 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x40018 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 @@ -2355,14 +2378,14 @@ ; GFX8-LABEL: udot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40004 ; GFX8-NEXT: s_bfe_u32 s9, s1, 0x4000c @@ -2425,14 +2448,14 @@ ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s3, s1, 0x40010 ; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 @@ -2491,14 +2514,14 @@ ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 ; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 @@ -2564,45 +2587,45 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s3, s5 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s2, s3 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s5 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s1, s3 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x4000c +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s3 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s2, s1, 0x40008 -; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s4, s2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x40008 +; GFX10-DL-NEXT: s_mov_b32 s1, 0xffff +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s0 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s5, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s1, v3 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s6 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s6 +; GFX10-DL-NEXT: s_bfe_u32 s3, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s7, s5, 0x40010 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s7 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s4 +; GFX10-DL-NEXT: s_lshr_b32 s2, s5, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s7 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s4, s2 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40018 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, s5, s0 +; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x40018 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, s3, s0 ; GFX10-DL-NEXT: v_or_b32_e32 v5, v6, v5 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v5, s3, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -2703,10 +2726,10 @@ ; GFX8-LABEL: udot8_acc4_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2754,10 +2777,10 @@ ; GFX9-LABEL: udot8_acc4_vecMul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2805,10 +2828,10 @@ ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2863,36 +2886,36 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s0, s4, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s5, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s5 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x4000c +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s1, s3 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s2, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -2978,33 +3001,34 @@ ; GFX8-LABEL: udot8_variant1: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s4, s2, 15 -; GFX8-NEXT: s_and_b32 s5, s3, 15 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s16, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: s_and_b32 s4, s0, 15 +; GFX8-NEXT: s_and_b32 s5, s1, 15 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s16, s0, 0x40018 +; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 ; GFX8-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX8-NEXT: s_bfe_u32 s7, s3, 0x40004 -; GFX8-NEXT: s_bfe_u32 s9, s3, 0x40008 -; GFX8-NEXT: s_bfe_u32 s11, s3, 0x4000c -; GFX8-NEXT: s_bfe_u32 s13, s3, 0x40010 -; GFX8-NEXT: s_bfe_u32 s15, s3, 0x40014 -; GFX8-NEXT: s_bfe_u32 s17, s3, 0x40018 -; GFX8-NEXT: s_lshr_b32 s3, s3, 28 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40004 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s17, s1, 0x40018 +; GFX8-NEXT: s_lshr_b32 s1, s1, 28 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 @@ -3017,41 +3041,42 @@ ; GFX8-NEXT: v_mad_u32_u24 v0, s15, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NEXT: v_mad_u32_u24 v2, s17, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_variant1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 15 -; GFX9-NEXT: s_and_b32 s5, s3, 15 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_and_b32 s4, s0, 15 +; GFX9-NEXT: s_and_b32 s5, s1, 15 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40004 -; GFX9-NEXT: s_bfe_u32 s9, s3, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40018 -; GFX9-NEXT: s_lshr_b32 s3, s3, 28 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x40004 +; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40018 +; GFX9-NEXT: s_lshr_b32 s1, s1, 28 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3064,43 +3089,45 @@ ; GFX9-NEXT: v_mad_u32_u24 v0, s15, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: v_mad_u32_u24 v2, s17, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_variant1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_nop 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s1, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi +; GFX10-DL-NEXT: s_nop 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll --- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll @@ -17,6 +17,7 @@ ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_short v[0:1], v1, off ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -36,6 +37,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_short v[0:1], v1, off ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -54,6 +56,7 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 ; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1 ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 @@ -80,6 +83,7 @@ ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_short v[0:1], v1, off ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -99,6 +103,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_short v[0:1], v1, off ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -117,6 +122,7 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 ; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1 ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 @@ -143,6 +149,7 @@ ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -162,6 +169,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -180,6 +188,7 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 ; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1 ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 @@ -206,6 +215,7 @@ ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -225,6 +235,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -243,6 +254,7 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1 ; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_nop 0 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1 ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2 @@ -269,6 +281,7 @@ ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 unorm tfe d16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -288,6 +301,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -345,6 +359,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf unorm tfe d16 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[1:2], off ; GFX9-NEXT: global_store_dword v[0:1], v3, off @@ -365,6 +380,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16 ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[1:2], off ; GFX10-NEXT: global_store_dword v[0:1], v3, off diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -435,11 +435,11 @@ ; VI-LABEL: add_inline_imm_0.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 0 +; VI-NEXT: v_add_f32_e64 v0, s2, 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0.0 @@ -462,11 +462,11 @@ ; VI-LABEL: add_inline_imm_0.5_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 0.5 +; VI-NEXT: v_add_f32_e64 v0, s2, 0.5 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0.5 @@ -489,11 +489,11 @@ ; VI-LABEL: add_inline_imm_neg_0.5_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, -0.5 +; VI-NEXT: v_add_f32_e64 v0, s2, -0.5 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -0.5 @@ -516,11 +516,11 @@ ; VI-LABEL: add_inline_imm_1.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s2, 1.0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 1.0 @@ -543,11 +543,11 @@ ; VI-LABEL: add_inline_imm_neg_1.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, -1.0 +; VI-NEXT: v_add_f32_e64 v0, s2, -1.0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -1.0 @@ -570,11 +570,11 @@ ; VI-LABEL: add_inline_imm_2.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 2.0 +; VI-NEXT: v_add_f32_e64 v0, s2, 2.0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 2.0 @@ -597,11 +597,11 @@ ; VI-LABEL: add_inline_imm_neg_2.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, -2.0 +; VI-NEXT: v_add_f32_e64 v0, s2, -2.0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -2.0 @@ -624,11 +624,11 @@ ; VI-LABEL: add_inline_imm_4.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 4.0 +; VI-NEXT: v_add_f32_e64 v0, s2, 4.0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 4.0 @@ -651,11 +651,11 @@ ; VI-LABEL: add_inline_imm_neg_4.0_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, -4.0 +; VI-NEXT: v_add_f32_e64 v0, s2, -4.0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -4.0 @@ -762,11 +762,11 @@ ; VI-LABEL: add_inline_imm_1_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 1 +; VI-NEXT: v_add_f32_e64 v0, s2, 1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36a0000000000000 @@ -789,11 +789,11 @@ ; VI-LABEL: add_inline_imm_2_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 2 +; VI-NEXT: v_add_f32_e64 v0, s2, 2 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36b0000000000000 @@ -816,11 +816,11 @@ ; VI-LABEL: add_inline_imm_16_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 16 +; VI-NEXT: v_add_f32_e64 v0, s2, 16 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36e0000000000000 @@ -844,11 +844,11 @@ ; VI-LABEL: add_inline_imm_neg_1_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s0, s0, -1 +; VI-NEXT: s_add_i32 s0, s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -875,11 +875,11 @@ ; VI-LABEL: add_inline_imm_neg_2_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s0, s0, -2 +; VI-NEXT: s_add_i32 s0, s2, -2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -906,11 +906,11 @@ ; VI-LABEL: add_inline_imm_neg_16_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s0, s0, -16 +; VI-NEXT: s_add_i32 s0, s2, -16 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -936,11 +936,11 @@ ; VI-LABEL: add_inline_imm_63_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 63 +; VI-NEXT: v_add_f32_e64 v0, s2, 63 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36ff800000000000 @@ -963,11 +963,11 @@ ; VI-LABEL: add_inline_imm_64_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s0, 64 +; VI-NEXT: v_add_f32_e64 v0, s2, 64 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x3700000000000000 @@ -990,6 +990,7 @@ ; VI-LABEL: add_inline_imm_0.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0 @@ -1017,6 +1018,7 @@ ; VI-LABEL: add_inline_imm_0.5_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.5 @@ -1044,6 +1046,7 @@ ; VI-LABEL: add_inline_imm_neg_0.5_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], -0.5 @@ -1071,6 +1074,7 @@ ; VI-LABEL: add_inline_imm_1.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 @@ -1098,6 +1102,7 @@ ; VI-LABEL: add_inline_imm_neg_1.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], -1.0 @@ -1125,6 +1130,7 @@ ; VI-LABEL: add_inline_imm_2.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2.0 @@ -1152,6 +1158,7 @@ ; VI-LABEL: add_inline_imm_neg_2.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], -2.0 @@ -1179,6 +1186,7 @@ ; VI-LABEL: add_inline_imm_4.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 @@ -1206,6 +1214,7 @@ ; VI-LABEL: add_inline_imm_neg_4.0_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], -4.0 @@ -1235,6 +1244,7 @@ ; VI-LABEL: add_inline_imm_inv_2pi_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.15915494309189532 @@ -1293,6 +1303,7 @@ ; VI-LABEL: add_inline_imm_1_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1 @@ -1320,6 +1331,7 @@ ; VI-LABEL: add_inline_imm_2_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2 @@ -1347,6 +1359,7 @@ ; VI-LABEL: add_inline_imm_16_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 16 @@ -1455,6 +1468,7 @@ ; VI-LABEL: add_inline_imm_63_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 63 @@ -1482,6 +1496,7 @@ ; VI-LABEL: add_inline_imm_64_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 64 diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -512,11 +512,11 @@ ; VI-LABEL: add_inline_imm_0.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x00,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x00,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -554,11 +554,11 @@ ; VI-LABEL: add_inline_imm_0.5_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe0,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe0,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -596,11 +596,11 @@ ; VI-LABEL: add_inline_imm_neg_0.5_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe2,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe2,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -638,11 +638,11 @@ ; VI-LABEL: add_inline_imm_1.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe4,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe4,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -680,11 +680,11 @@ ; VI-LABEL: add_inline_imm_neg_1.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe6,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe6,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -722,11 +722,11 @@ ; VI-LABEL: add_inline_imm_2.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe8,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe8,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -764,11 +764,11 @@ ; VI-LABEL: add_inline_imm_neg_2.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xea,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xea,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -806,11 +806,11 @@ ; VI-LABEL: add_inline_imm_4.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xec,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xec,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -848,11 +848,11 @@ ; VI-LABEL: add_inline_imm_neg_4.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xee,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xee,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1016,11 +1016,11 @@ ; VI-LABEL: add_inline_imm_1_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 1 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x02,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 1 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x02,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1058,11 +1058,11 @@ ; VI-LABEL: add_inline_imm_2_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 2 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x04,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 2 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x04,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1100,11 +1100,11 @@ ; VI-LABEL: add_inline_imm_16_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 16 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x20,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 16 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x20,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1328,11 +1328,11 @@ ; VI-LABEL: add_inline_imm_63_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 63 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x7e,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 63 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x7e,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1370,11 +1370,11 @@ ; VI-LABEL: add_inline_imm_64_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 ; encoding: [0x02,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 64 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x80,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 64 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x80,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; FIXME: Merge into imm.ll diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -273,7 +273,7 @@ ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} ; MOVREL: v_movreld_b32_e32 v0, 5 -; IDXMODE: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0xfffffe00{{$}} +; IDXMODE: s_addk_i32 s{{[0-9]+}}, 0xfe00{{$}} ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(DST) ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; IDXMODE-NEXT: s_set_gpr_idx_off diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -28,15 +28,15 @@ ; VI-LABEL: insertelement_v4f32_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, 0x40a00000 +; VI-NEXT: s_mov_b32 s8, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 @@ -63,15 +63,15 @@ ; VI-LABEL: insertelement_v4f32_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s5, 0x40a00000 +; VI-NEXT: s_mov_b32 s9, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 @@ -98,15 +98,15 @@ ; VI-LABEL: insertelement_v4f32_2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s6, 0x40a00000 +; VI-NEXT: s_mov_b32 s10, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 @@ -133,15 +133,15 @@ ; VI-LABEL: insertelement_v4f32_3: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s7, 0x40a00000 +; VI-NEXT: s_mov_b32 s11, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 @@ -168,15 +168,15 @@ ; VI-LABEL: insertelement_v4i32_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_movk_i32 s4, 0x3e7 +; VI-NEXT: s_movk_i32 s8, 0x3e7 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x i32> %a, i32 999, i32 0 @@ -200,15 +200,15 @@ ; ; VI-LABEL: insertelement_v3f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 @@ -231,15 +231,15 @@ ; ; VI-LABEL: insertelement_v3f32_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 @@ -302,16 +302,16 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s8, 1 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s8, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -344,23 +344,23 @@ ; ; VI-LABEL: dynamic_insertelement_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s4, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x20 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 2 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 1 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s8 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 @@ -394,26 +394,26 @@ ; ; VI-LABEL: dynamic_insertelement_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s4, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x20 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 3 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 2 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 1 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cmp_ne_u32_e64 vcc, s6, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 @@ -462,12 +462,13 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v5, s15 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s10 @@ -479,7 +480,6 @@ ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-NEXT: v_mov_b32_e32 v5, s15 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 ; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc ; VI-NEXT: v_mov_b32_e32 v5, s14 @@ -535,12 +535,11 @@ ; ; VI-LABEL: dynamic_insertelement_v16f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dword s6, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 @@ -558,7 +557,8 @@ ; VI-NEXT: v_mov_b32_e32 v13, s21 ; VI-NEXT: v_mov_b32_e32 v14, s22 ; VI-NEXT: v_mov_b32_e32 v15, s23 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 m0, s6 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v0, v16 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 @@ -592,16 +592,16 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 1 -; VI-NEXT: s_cselect_b32 s5, s7, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 0 -; VI-NEXT: s_cselect_b32 s4, s6, 5 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_cmp_lg_u32 s8, 1 +; VI-NEXT: s_cselect_b32 s4, s7, 5 +; VI-NEXT: s_cmp_lg_u32 s8, 0 +; VI-NEXT: s_cselect_b32 s5, s6, 5 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i32> %a, i32 5, i32 %b @@ -632,22 +632,23 @@ ; ; VI-LABEL: dynamic_insertelement_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s4, s[4:5], 0x20 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 2 -; VI-NEXT: s_cselect_b32 s5, s10, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 1 -; VI-NEXT: s_cselect_b32 s6, s9, 5 -; VI-NEXT: s_cmp_lg_u32 s4, 0 -; VI-NEXT: s_cselect_b32 s4, s8, 5 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x20 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s3, 2 +; VI-NEXT: s_cselect_b32 s2, s2, 5 +; VI-NEXT: s_cmp_lg_u32 s3, 1 +; VI-NEXT: s_cselect_b32 s1, s1, 5 +; VI-NEXT: s_cmp_lg_u32 s3, 0 +; VI-NEXT: s_cselect_b32 s0, s0, 5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x i32> %a, i32 5, i32 %b store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 @@ -682,26 +683,26 @@ ; ; VI-LABEL: dynamic_insertelement_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_load_dword s6, s[4:5], 0x20 -; VI-NEXT: s_load_dword s4, s[4:5], 0x44 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dword s7, s[4:5], 0x44 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s6, 3 -; VI-NEXT: s_cselect_b32 s5, s4, s11 +; VI-NEXT: s_cselect_b32 s3, s7, s3 ; VI-NEXT: s_cmp_eq_u32 s6, 2 -; VI-NEXT: s_cselect_b32 s7, s4, s10 +; VI-NEXT: s_cselect_b32 s2, s7, s2 ; VI-NEXT: s_cmp_eq_u32 s6, 1 -; VI-NEXT: s_cselect_b32 s9, s4, s9 +; VI-NEXT: s_cselect_b32 s1, s7, s1 ; VI-NEXT: s_cmp_eq_u32 s6, 0 -; VI-NEXT: s_cselect_b32 s4, s4, s8 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_cselect_b32 s0, s7, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 @@ -749,8 +750,8 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s4, 3 @@ -820,9 +821,9 @@ ; ; VI-LABEL: dynamic_insertelement_v16i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dword s6, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -842,7 +843,7 @@ ; VI-NEXT: v_mov_b32_e32 v13, s21 ; VI-NEXT: v_mov_b32_e32 v14, s22 ; VI-NEXT: v_mov_b32_e32 v15, s23 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 m0, s6 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 @@ -875,13 +876,13 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-NEXT: s_load_dword s4, s[4:5], 0xc +; VI-NEXT: s_load_dword s7, s[4:5], 0xc ; VI-NEXT: v_mov_b32_e32 v0, 0x50005 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: s_lshl_b32 s4, s4, 4 +; VI-NEXT: s_lshl_b32 s4, s7, 4 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -919,18 +920,18 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 ; VI-NEXT: s_mov_b32 s5, 0 +; VI-NEXT: s_mov_b32 s4, 0xffff ; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_lshl_b32 s8, s4, 4 -; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 4 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 ; VI-NEXT: s_mov_b32 s8, 0x50005 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_bfi_b32 v0, s5, v0, v1 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, s8 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_bfi_b32 v1, s4, v1, v2 @@ -963,11 +964,11 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28 -; VI-NEXT: s_load_dword s4, s[4:5], 0x4c +; VI-NEXT: s_load_dword s7, s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 +; VI-NEXT: s_lshl_b32 s4, s7, 3 ; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1 ; VI-NEXT: v_and_b32_e32 v1, 0x505, v0 ; VI-NEXT: v_xor_b32_e32 v0, -1, v0 @@ -1005,13 +1006,13 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28 -; VI-NEXT: s_load_dword s4, s[4:5], 0x4c +; VI-NEXT: s_load_dword s7, s[4:5], 0x4c ; VI-NEXT: v_mov_b32_e32 v0, 0x5050505 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: s_lshl_b32 s4, s4, 3 +; VI-NEXT: s_lshl_b32 s4, s7, 3 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -1044,13 +1045,13 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28 -; VI-NEXT: s_load_dword s4, s[4:5], 0x4c +; VI-NEXT: s_load_dword s7, s[4:5], 0x4c ; VI-NEXT: v_mov_b32_e32 v0, 0x5050505 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: s_lshl_b32 s4, s4, 3 +; VI-NEXT: s_lshl_b32 s4, s7, 3 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1230,8 +1231,8 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s4, s[4:5], 0x20 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_load_dword s4, s[4:5], 0x20 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s11, 24 @@ -1354,6 +1355,7 @@ ; VI-LABEL: insert_split_bb: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dword s0, s[4:5], 0x10 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 @@ -1421,24 +1423,24 @@ ; ; VI-LABEL: dynamic_insertelement_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x30 -; VI-NEXT: s_load_dword s4, s[4:5], 0x60 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 ; VI-NEXT: v_mov_b32_e32 v1, 0x40200000 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dword s4, s[4:5], 0x60 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x double> %a, double 8.0, i32 %b store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 @@ -1469,23 +1471,23 @@ ; ; VI-LABEL: dynamic_insertelement_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_load_dword s6, s[4:5], 0x20 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 ; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s6, 0 +; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[2:3] +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i64> %a, i64 5, i32 %b store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 @@ -1589,12 +1591,13 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: v_mov_b32_e32 v4, 0x40200000 +; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v5, s15 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v0, s10 @@ -1604,7 +1607,6 @@ ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-NEXT: v_mov_b32_e32 v5, s15 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3 ; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v5, s14 @@ -1660,15 +1662,14 @@ ; ; VI-LABEL: dynamic_insertelement_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dword s6, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_lshl_b32 s4, s4, 1 +; VI-NEXT: s_lshl_b32 s4, s6, 1 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 @@ -1686,6 +1687,7 @@ ; VI-NEXT: v_mov_b32_e32 v15, s23 ; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, 0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v1, v16 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -41,13 +41,13 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s0 +; GFX9-NEXT: s_pack_lh_b32_b16 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -55,12 +55,12 @@ ; VI-LABEL: s_insertelement_v2i16_0_reg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: s_and_b32 s1, s6, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 ; VI-NEXT: s_or_b32 s0, s1, s0 @@ -93,14 +93,14 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: ;;#ASMSTART @@ -111,12 +111,12 @@ ; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: s_and_b32 s1, s6, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s0, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 @@ -160,13 +160,13 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s0 +; GFX9-NEXT: s_pack_hh_b32_b16 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -174,12 +174,12 @@ ; VI-LABEL: s_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: s_lshr_b32 s1, s6, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 ; VI-NEXT: s_or_b32 s0, s1, s0 @@ -214,11 +214,11 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_lshr_b32 s0, s6, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s1, s0, s2 @@ -232,12 +232,12 @@ ; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: s_lshr_b32 s0, s6, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 ; VI-NEXT: s_or_b32 s1, s0, s1 @@ -280,12 +280,12 @@ ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s1, s0 @@ -302,12 +302,12 @@ ; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: s_lshr_b32 s1, s6, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s0, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 @@ -394,13 +394,13 @@ ; GFX9-LABEL: s_insertelement_v2i16_1_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -408,12 +408,12 @@ ; VI-LABEL: s_insertelement_v2i16_1_reg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshl_b32 s1, s4, 16 +; VI-NEXT: s_lshl_b32 s1, s6, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_or_b32 s0, s0, s1 @@ -578,7 +578,7 @@ ; GFX9-LABEL: v_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff0000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -586,7 +586,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e64 v1, 16, s4 +; GFX9-NEXT: v_lshrrev_b32_e64 v1, 16, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc @@ -598,7 +598,7 @@ ; VI-LABEL: v_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -607,7 +607,7 @@ ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: s_lshr_b32 s0, s6, 16 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -1103,17 +1103,17 @@ ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, s4, 4 ; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -1121,17 +1121,17 @@ ; VI-LABEL: s_insertelement_v2i16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x0 -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x0 +; VI-NEXT: s_load_dword s5, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s0, s4, 4 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 -; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1164,7 +1164,7 @@ ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1172,7 +1172,7 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_lshl_b32 s0, s4, 4 +; GFX9-NEXT: s_lshl_b32 s0, s6, 4 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3e703e7 @@ -1185,7 +1185,7 @@ ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1193,7 +1193,7 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s4, 4 +; VI-NEXT: s_lshl_b32 s0, s6, 4 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 ; VI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 @@ -1237,54 +1237,54 @@ ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_mov_b32 s0, 0x12341234 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v5 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v2, s0, v3 +; GFX9-NEXT: v_bfi_b32 v2, v2, s0, v6 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] +; VI-NEXT: flat_load_dword v6, v[0:1] ; VI-NEXT: s_mov_b32 s2, 0xffff ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_mov_b32 s0, 0x12341234 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v5 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_bfi_b32 v2, v2, s0, v3 +; VI-NEXT: v_bfi_b32 v2, v2, s0, v6 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1329,7 +1329,7 @@ ; GFX9-LABEL: v_insertelement_v4f16_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1341,14 +1341,14 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, v4, s4, v0 +; GFX9-NEXT: v_bfi_b32 v0, v4, s6, v0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1357,7 +1357,7 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_and_b32 s0, s4, 0xffff +; VI-NEXT: s_and_b32 s0, s6, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -1400,7 +1400,7 @@ ; GFX9-LABEL: v_insertelement_v4f16_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1412,14 +1412,14 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s4, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, s6, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1427,7 +1427,7 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s4, 16 +; VI-NEXT: s_lshl_b32 s0, s6, 16 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1471,7 +1471,7 @@ ; GFX9-LABEL: v_insertelement_v4f16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1483,14 +1483,14 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v4, s4, v1 +; GFX9-NEXT: v_bfi_b32 v1, v4, s6, v1 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1499,7 +1499,7 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_and_b32 s0, s4, 0xffff +; VI-NEXT: s_and_b32 s0, s6, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -1542,7 +1542,7 @@ ; GFX9-LABEL: v_insertelement_v4f16_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1554,14 +1554,14 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_3: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1569,7 +1569,7 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s4, 16 +; VI-NEXT: s_lshl_b32 s0, s6, 16 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1613,7 +1613,7 @@ ; GFX9-LABEL: v_insertelement_v4i16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1625,14 +1625,14 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v4, s4, v1 +; GFX9-NEXT: v_bfi_b32 v1, v4, s6, v1 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4i16_2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1641,7 +1641,7 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_and_b32 s0, s4, 0xffff +; VI-NEXT: s_and_b32 s0, s6, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -1685,55 +1685,55 @@ ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v2, v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s6, s6 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v5 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, s[2:3] +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v3, s1, v1 -; GFX9-NEXT: v_bfi_b32 v0, v2, s1, v0 +; GFX9-NEXT: v_bfi_b32 v1, v1, s1, v3 +; GFX9-NEXT: v_bfi_b32 v0, v0, s1, v2 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dword v5, v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_mov_b32 s2, 0xffff -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-NEXT: s_and_b32 s1, s6, s2 ; VI-NEXT: s_mov_b32 s3, 0 -; VI-NEXT: s_and_b32 s1, s4, s2 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_lshl_b32 s0, s1, 16 ; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v5 ; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_bfi_b32 v1, v5, s0, v1 -; VI-NEXT: v_bfi_b32 v0, v4, s0, v0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_bfi_b32 v3, v5, s0, v3 +; VI-NEXT: v_bfi_b32 v2, v4, s0, v2 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr: @@ -1780,18 +1780,18 @@ ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_lshl_b32 s1, s5, 4 +; GFX9-NEXT: s_lshl_b32 s1, s7, 4 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s4 @@ -1806,7 +1806,7 @@ ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1816,8 +1816,8 @@ ; VI-NEXT: s_mov_b32 s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_mov_b32 s3, 0 -; VI-NEXT: s_lshl_b32 s1, s5, 4 -; VI-NEXT: s_and_b32 s4, s4, s2 +; VI-NEXT: s_lshl_b32 s1, s7, 4 +; VI-NEXT: s_and_b32 s4, s6, s2 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 ; VI-NEXT: s_lshl_b32 s2, s4, 16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -19,10 +19,10 @@ ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -31,10 +31,10 @@ ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -59,10 +59,10 @@ ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -70,10 +70,10 @@ ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -114,44 +114,44 @@ ; VI-LABEL: v_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v6, v[0:1] +; VI-NEXT: flat_load_dword v7, v[2:3] ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v6, v7 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v6, v7 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -309,44 +309,44 @@ ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v6, v[0:1] +; VI-NEXT: flat_load_dword v7, v[2:3] ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v6, v7 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v6, v7 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -385,44 +385,44 @@ ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v6, v[0:1] +; VI-NEXT: flat_load_dword v7, v[2:3] ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v6, -v7 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v6, -v7 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -461,44 +461,44 @@ ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v6, v[0:1] +; VI-NEXT: flat_load_dword v7, v[2:3] ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v6, -v7 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v6, -v7 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -538,44 +538,44 @@ ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v6, v[0:1] +; VI-NEXT: flat_load_dword v7, v[2:3] ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v6|, -v7 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v6|, -v7 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -3208,6 +3208,7 @@ ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm ; FIJI-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf unorm +; FIJI-NEXT: s_nop 0 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf unorm ; FIJI-NEXT: s_endpgm @@ -3216,6 +3217,7 @@ ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX6789-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf unorm +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf unorm ; GFX6789-NEXT: s_endpgm @@ -3224,6 +3226,7 @@ ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm ; NOPRT-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf unorm +; NOPRT-NEXT: s_nop 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm @@ -3233,6 +3236,7 @@ ; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x20,0xf0,0x04,0x00,0x00,0x00] ; GFX10-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x00,0xf0,0x04,0x00,0x02,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x20,0xf0,0x04,0x00,0x04,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll @@ -32,8 +32,9 @@ } ; GCN-LABEL: {{^}}sample_contig_nsa: -; GCN: image_sample_c_l v0, v[0:7], -; NSA: image_sample v1, [v6, v7, v5], +; NONSA: image_sample_c_l v0, v[0:7], +; NSA: image_sample_c_l v8, v[0:7], +; NSA: image_sample v9, [v6, v7, v5], define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -44,8 +45,8 @@ } ; GCN-LABEL: {{^}}sample_nsa_nsa: -; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0], -; NSA: image_sample v1, [v6, v7, v5], +; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0], +; NSA: image_sample v9, [v6, v7, v5], define amdgpu_ps <2 x float> @sample_nsa_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %r2, float %s2, float %t2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -56,8 +57,8 @@ } ; GCN-LABEL: {{^}}sample_nsa_contig: -; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0], -; NSA: image_sample v1, v[5:7], +; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0], +; NSA: image_sample v9, v[5:7], define amdgpu_ps <2 x float> @sample_nsa_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %s2, float %t2, float %r2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -68,8 +69,9 @@ } ; GCN-LABEL: {{^}}sample_contig_contig: -; GCN: image_sample_c_l v0, v[0:7], -; NSA: image_sample v1, v[5:7], +; NSA: image_sample_c_l v8, v[0:7], +; NSA: image_sample v9, v[5:7], +; NONSA: image_sample_c_l v0, v[0:7], ; NONSA: image_sample v1, v[5:7], define amdgpu_ps <2 x float> @sample_contig_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %s2, float %t2, float %r2) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -70,6 +70,7 @@ ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[6:7], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) @@ -90,6 +91,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] @@ -492,6 +494,7 @@ ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe +; GFX6789-NEXT: s_nop 0 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[6:7], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) @@ -512,6 +515,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00] +; GFX10-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -50,11 +50,11 @@ ; VARIANT2-LABEL: test_barrier: ; VARIANT2: ; %bb.0: ; %entry ; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c +; VARIANT2-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT2-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT2-NEXT: v_mov_b32_e32 v2, s3 -; VARIANT2-NEXT: v_xad_u32 v3, v0, -1, s0 +; VARIANT2-NEXT: v_xad_u32 v3, v0, -1, s4 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT2-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 ; VARIANT2-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] @@ -66,6 +66,7 @@ ; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: s_barrier ; VARIANT2-NEXT: global_load_dword v0, v[3:4], off +; VARIANT2-NEXT: s_nop 0 ; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: global_store_dword v[1:2], v0, off ; VARIANT2-NEXT: s_endpgm @@ -73,11 +74,11 @@ ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry ; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c +; VARIANT3-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT3-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT3-NEXT: v_mov_b32_e32 v2, s3 -; VARIANT3-NEXT: v_xad_u32 v3, v0, -1, s0 +; VARIANT3-NEXT: v_xad_u32 v3, v0, -1, s4 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT3-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 ; VARIANT3-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] @@ -88,6 +89,7 @@ ; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc ; VARIANT3-NEXT: s_barrier ; VARIANT3-NEXT: global_load_dword v0, v[3:4], off +; VARIANT3-NEXT: s_nop 0 ; VARIANT3-NEXT: s_waitcnt vmcnt(0) ; VARIANT3-NEXT: global_store_dword v[1:2], v0, off ; VARIANT3-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -18,12 +18,12 @@ ; VI-LABEL: bfe_u32_arg_arg_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_bfe_u32 v0, v0, s1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_bfe_u32 v0, v0, s3, s3 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1) @@ -48,13 +48,13 @@ ; VI-LABEL: bfe_u32_arg_arg_imm: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_mov_b32_e32 v1, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_bfe_u32 v0, s0, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_bfe_u32 v0, s2, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123) @@ -79,13 +79,13 @@ ; VI-LABEL: bfe_u32_arg_imm_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_bfe_u32 v0, s0, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_bfe_u32 v0, s2, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2) @@ -111,14 +111,14 @@ ; VI-LABEL: bfe_u32_imm_arg_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_movk_i32 s2, 0x7b +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_movk_i32 s0, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_bfe_u32 v0, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_bfe_u32 v0, s0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2) @@ -207,6 +207,7 @@ ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1559,22 +1560,22 @@ ; VI-LABEL: simplify_bfe_u32_multi_use_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 63, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { @@ -1602,11 +1603,11 @@ ; VI-LABEL: lshr_and: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s0, s0, 0x30006 +; VI-NEXT: s_bfe_u32 s0, s2, 0x30006 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1632,12 +1633,12 @@ ; VI-LABEL: v_lshr_and: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_bfe_u32 v0, s0, v0, 3 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_bfe_u32 v0, s2, v0, 3 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %c = lshr i32 %a, %b @@ -1662,11 +1663,11 @@ ; VI-LABEL: and_lshr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s0, s0, 0x30006 +; VI-NEXT: s_bfe_u32 s0, s2, 0x30006 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1692,11 +1693,11 @@ ; VI-LABEL: and_lshr2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s0, s0, 0x30006 +; VI-NEXT: s_bfe_u32 s0, s2, 0x30006 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1722,11 +1723,11 @@ ; VI-LABEL: shl_lshr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s0, s0, 0x150002 +; VI-NEXT: s_bfe_u32 s0, s2, 0x150002 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -292,17 +292,17 @@ ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s6, s6 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -501,18 +501,18 @@ ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s4, s4 -; VI-NEXT: v_max_f16_e64 v0, s6, s6 +; VI-NEXT: v_max_f16_e64 v0, s10, s10 ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: s_lshr_b32 s6, s6, 16 +; VI-NEXT: s_lshr_b32 s6, s10, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 ; VI-NEXT: v_max_f16_e64 v1, s6, s6 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v1, s11, s11 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e32 v1, v2, v1 ; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 @@ -529,12 +529,12 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 @@ -610,21 +610,21 @@ ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s5, s5 -; VI-NEXT: v_max_f16_e64 v0, s7, s7 +; VI-NEXT: v_max_f16_e64 v0, s11, s11 ; VI-NEXT: s_lshr_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b32 s6, s11, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 -; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v1, s6, s6 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_max_f16_e64 v2, s4, s4 -; VI-NEXT: v_max_f16_e64 v0, s6, s6 +; VI-NEXT: v_max_f16_e64 v0, s10, s10 ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: s_lshr_b32 s5, s6, 16 +; VI-NEXT: s_lshr_b32 s5, s10, 16 ; VI-NEXT: v_max_f16_e32 v0, v2, v0 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e64 v3, s4, s4 @@ -643,12 +643,12 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -315,17 +315,17 @@ ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s6, s6 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -554,18 +554,18 @@ ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s4, s4 -; VI-NEXT: v_max_f16_e64 v0, s6, s6 +; VI-NEXT: v_max_f16_e64 v0, s10, s10 ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: s_lshr_b32 s6, s6, 16 +; VI-NEXT: s_lshr_b32 s6, s10, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 ; VI-NEXT: v_max_f16_e64 v1, s6, s6 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v1, s11, s11 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_min_f16_e32 v1, v2, v1 ; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 @@ -582,12 +582,12 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 @@ -663,21 +663,21 @@ ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s5, s5 -; VI-NEXT: v_max_f16_e64 v0, s7, s7 +; VI-NEXT: v_max_f16_e64 v0, s11, s11 ; VI-NEXT: s_lshr_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b32 s6, s11, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 -; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v1, s6, s6 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_max_f16_e64 v2, s4, s4 -; VI-NEXT: v_max_f16_e64 v0, s6, s6 +; VI-NEXT: v_max_f16_e64 v0, s10, s10 ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: s_lshr_b32 s5, s6, 16 +; VI-NEXT: s_lshr_b32 s5, s10, 16 ; VI-NEXT: v_min_f16_e32 v0, v2, v0 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e64 v3, s4, s4 @@ -696,12 +696,12 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -254,6 +254,7 @@ ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword ; GFX900-NEXT: s_waitcnt @@ -271,6 +272,7 @@ ; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg: ; GCN: s_waitcnt ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword ; GFX900-NEXT: s_waitcnt @@ -288,6 +290,7 @@ ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt ; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword ; GFX900-NEXT: s_waitcnt @@ -306,6 +309,7 @@ ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt ; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword ; GFX900-NEXT: s_waitcnt @@ -324,6 +328,7 @@ ; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_sexti8: ; GCN: s_waitcnt ; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword ; GFX900-NEXT: s_waitcnt @@ -343,6 +348,7 @@ ; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_zexti8: ; GCN: s_waitcnt ; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword ; GFX900-NEXT: s_waitcnt @@ -362,6 +368,7 @@ ; GCN-LABEL: load_flat_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt ; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1] +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v2 ; GFX900-NEXT: s_waitcnt @@ -383,6 +390,7 @@ ; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg: ; GCN: s_waitcnt ; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1] +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v2 ; GFX900-NEXT: s_waitcnt @@ -404,6 +412,7 @@ ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt ; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1] +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v2 ; GFX900-NEXT: s_waitcnt @@ -426,6 +435,7 @@ ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt ; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1] +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v2 ; GFX900-NEXT: s_waitcnt @@ -448,6 +458,7 @@ ; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_zexti8: ; GCN: s_waitcnt ; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1] +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v2 ; GFX900-NEXT: s_waitcnt @@ -471,6 +482,7 @@ ; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_sexti8: ; GCN: s_waitcnt ; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1] +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v2 ; GFX900-NEXT: s_waitcnt @@ -494,6 +506,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -513,6 +526,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg: ; GCN: s_waitcnt ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -532,6 +546,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff: ; GCN: s_waitcnt ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}} +; GFX900-NEXT: s_nop ; GFX900: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -550,6 +565,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff: ; GCN: s_waitcnt ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -568,6 +584,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt ; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -588,6 +605,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8: ; GCN: s_waitcnt ; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -609,6 +627,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8: ; GCN: s_waitcnt ; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -630,6 +649,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt ; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -650,6 +670,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt ; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -669,6 +690,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8: ; GCN: s_waitcnt ; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -688,6 +710,7 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt ; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -708,6 +731,7 @@ ; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword ; GFX900-NEXT: s_waitcnt @@ -728,6 +752,7 @@ ; GCN-LABEL: load_constant_hi_v2f16_reglo_vreg ; GCN: s_waitcnt ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword ; GFX900-NEXT: s_waitcnt @@ -748,6 +773,7 @@ ; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_sexti8: ; GCN: s_waitcnt ; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword ; GFX900-NEXT: s_waitcnt @@ -767,6 +793,7 @@ ; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_zexti8: ; GCN: s_waitcnt ; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword ; GFX900-NEXT: s_waitcnt @@ -917,6 +944,7 @@ ; GCN-LABEL: {{^}}load_global_v2i16_split: ; GCN: s_waitcnt ; GFX900-NEXT: global_load_ushort v2 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_load_short_d16_hi v2 ; GFX900-NEXT: s_waitcnt @@ -936,6 +964,7 @@ ; GCN-LABEL: {{^}}load_flat_v2i16_split: ; GCN: s_waitcnt ; GFX900-NEXT: flat_load_ushort v2 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: flat_load_short_d16_hi v2 ; GFX900-NEXT: s_waitcnt @@ -955,6 +984,7 @@ ; GCN-LABEL: {{^}}load_constant_v2i16_split: ; GCN: s_waitcnt ; GFX900-NEXT: global_load_ushort v2 +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_load_short_d16_hi v2 ; GFX900-NEXT: s_waitcnt @@ -975,6 +1005,7 @@ ; GCN-LABEL: {{^}}load_private_v2i16_split: ; GCN: s_waitcnt ; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}} +; GFX900-NEXT: s_nop ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 ; GFX900-NEXT: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -671,6 +671,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -713,6 +714,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -756,6 +758,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -800,6 +803,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -843,6 +847,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -889,6 +894,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -934,6 +940,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: flat_load_short_d16 v2, v[0:1] +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -973,6 +980,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: flat_load_short_d16 v2, v[0:1] +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1015,6 +1023,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1] +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1056,6 +1065,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1] +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1096,6 +1106,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1] +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1139,6 +1150,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1] +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1181,6 +1193,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1263,6 +1276,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1304,6 +1318,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1343,6 +1358,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1382,6 +1398,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1422,6 +1439,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1464,6 +1482,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1505,6 +1524,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1546,6 +1566,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1586,6 +1607,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1629,6 +1651,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1671,6 +1694,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1714,6 +1738,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1760,6 +1785,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1807,6 +1833,7 @@ ; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1857,6 +1884,7 @@ ; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1908,6 +1936,7 @@ ; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1960,6 +1989,7 @@ ; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -2013,6 +2043,7 @@ ; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir b/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir --- a/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir +++ b/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir @@ -14,10 +14,10 @@ ; GCN: S_BRANCH %bb.1 ; GCN: bb.6 (align 64): ; GCN: successors: %bb.7(0x04000000), %bb.1(0x7c000000) - ; GCN: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GCN: S_CBRANCH_VCCNZ %bb.7, implicit $vcc_lo ; GCN: bb.1: ; GCN: successors: %bb.2(0x40000000), %bb.3(0x40000000) - ; GCN: S_CBRANCH_VCCNZ %bb.2, implicit $vcc + ; GCN: S_CBRANCH_VCCNZ %bb.2, implicit $vcc_lo ; GCN: bb.3: ; GCN: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; GCN: SI_MASK_BRANCH %bb.6, implicit $exec @@ -44,6 +44,7 @@ ; GCN: successors: %bb.6(0x80000000) ; GCN: S_BRANCH %bb.6 ; GCN: bb.7: + ; GCN: S_INST_PREFETCH 2 ; GCN: S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -10,10 +10,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, s5, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -23,18 +23,18 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dword s5, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x30 +; VI-NEXT: s_load_dword s6, s[0:1], 0x30 ; VI-NEXT: s_mov_b32 s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_lshr_b32 s1, s5, 16 -; VI-NEXT: s_lshr_b32 s6, s0, 16 -; VI-NEXT: s_lshr_b32 s1, s1, s6 -; VI-NEXT: s_and_b32 s5, s5, s4 -; VI-NEXT: s_and_b32 s0, s0, s4 -; VI-NEXT: s_lshr_b32 s0, s5, s0 -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_and_b32 s1, s5, s4 +; VI-NEXT: s_lshr_b32 s0, s5, 16 +; VI-NEXT: s_lshr_b32 s5, s6, 16 +; VI-NEXT: s_lshr_b32 s0, s0, s5 +; VI-NEXT: s_and_b32 s4, s6, s4 +; VI-NEXT: s_lshr_b32 s1, s1, s4 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -75,12 +75,12 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v5, v4 ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; @@ -95,14 +95,14 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v5 -; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_lshrrev_b16_e32 v2, v6, v5 +; VI-NEXT: v_lshrrev_b16_sdwa v3, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -147,7 +147,7 @@ ; GFX9-LABEL: lshr_v_s_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -158,27 +158,27 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v0, s0, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, s2, v0 ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_s_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: s_lshr_b32 s0, s2, 16 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v4, s0, v3 +; VI-NEXT: v_lshrrev_b16_e32 v4, s2, v3 ; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -222,7 +222,7 @@ ; GFX9-LABEL: lshr_s_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -233,27 +233,27 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, s0 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, s2 ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_s_v_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: s_lshr_b32 s0, s2, 16 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v4, v3, s0 +; VI-NEXT: v_lshrrev_b16_e64 v4, v3, s2 ; VI-NEXT: v_lshrrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -428,45 +428,45 @@ ; GFX9-LABEL: v_lshr_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v3 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_pk_lshrrev_b16 v3, v5, v3 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1 -; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0 -; VI-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v1, v6, v1 -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: v_lshrrev_b16_e32 v2, v7, v5 +; VI-NEXT: v_lshrrev_b16_sdwa v3, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b16_e32 v5, v6, v4 +; VI-NEXT: v_lshrrev_b16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v3, v2, v3 +; VI-NEXT: v_or_b32_e32 v2, v5, v4 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_lshr_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -7,44 +7,44 @@ ; VI-LABEL: v_test_imax_sge_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_ushort v1, v[2:3] +; VI-NEXT: flat_load_ushort v6, v[0:1] +; VI-NEXT: flat_load_ushort v7, v[2:3] ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v0, v0, v1 +; VI-NEXT: v_max_i16_e32 v0, v6, v7 ; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: global_load_ushort v7, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v0, v0, v1 +; GFX9-NEXT: v_max_i16_e32 v0, v6, v7 ; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -64,46 +64,46 @@ ; VI-LABEL: v_test_imax_sge_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v3, v5, v2 -; VI-NEXT: v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_max_i16_e32 v2, v5, v6 +; VI-NEXT: v_max_i16_sdwa v3, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_max_i16 v0, v6, v7 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -123,33 +123,33 @@ ; VI-LABEL: v_test_imax_sge_v3i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ushort v4, v[4:5] -; VI-NEXT: flat_load_dword v5, v[0:1] +; VI-NEXT: flat_load_ushort v7, v[4:5] +; VI-NEXT: flat_load_dword v8, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v7, v[2:3] -; VI-NEXT: flat_load_ushort v8, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] +; VI-NEXT: flat_load_ushort v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_max_i16_e32 v6, v5, v7 -; VI-NEXT: v_max_i16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_i16_e32 v6, v8, v5 +; VI-NEXT: v_max_i16_sdwa v5, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v4, v4, v8 +; VI-NEXT: v_max_i16_e32 v4, v7, v4 ; VI-NEXT: v_or_b32_e32 v5, v6, v5 ; VI-NEXT: flat_store_short v[2:3], v4 ; VI-NEXT: flat_store_dword v[0:1], v5 @@ -158,7 +158,7 @@ ; GFX9-LABEL: v_test_imax_sge_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 @@ -166,12 +166,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v5 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_short_d16 v6, v[0:1], off offset:4 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v4, v[2:3], off offset:4 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v5 @@ -200,51 +203,51 @@ ; VI-LABEL: v_test_imax_sge_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v6, v1, v3 -; VI-NEXT: v_max_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_i16_e32 v3, v0, v2 -; VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v1, v6, v1 -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: v_max_i16_e32 v2, v5, v7 +; VI-NEXT: v_max_i16_sdwa v3, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_i16_e32 v5, v4, v6 +; VI-NEXT: v_max_i16_sdwa v4, v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v3, v2, v3 +; VI-NEXT: v_or_b32_e32 v2, v5, v4 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v1, v1, v3 -; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_pk_max_i16 v3, v5, v7 +; GFX9-NEXT: v_pk_max_i16 v2, v4, v6 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid @@ -263,44 +266,44 @@ ; VI-LABEL: v_test_imax_sgt_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_ushort v1, v[2:3] +; VI-NEXT: flat_load_ushort v6, v[0:1] +; VI-NEXT: flat_load_ushort v7, v[2:3] ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v0, v0, v1 +; VI-NEXT: v_max_i16_e32 v0, v6, v7 ; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sgt_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: global_load_ushort v7, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v0, v0, v1 +; GFX9-NEXT: v_max_i16_e32 v0, v6, v7 ; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -320,44 +323,44 @@ ; VI-LABEL: v_test_umax_uge_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_ushort v1, v[2:3] +; VI-NEXT: flat_load_ushort v6, v[0:1] +; VI-NEXT: flat_load_ushort v7, v[2:3] ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v0, v0, v1 +; VI-NEXT: v_max_u16_e32 v0, v6, v7 ; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_uge_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: global_load_ushort v7, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_max_u16_e32 v0, v6, v7 ; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -377,44 +380,44 @@ ; VI-LABEL: v_test_umax_ugt_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_ushort v1, v[2:3] +; VI-NEXT: flat_load_ushort v6, v[0:1] +; VI-NEXT: flat_load_ushort v7, v[2:3] ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v0, v0, v1 +; VI-NEXT: v_max_u16_e32 v0, v6, v7 ; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_ugt_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: global_load_ushort v7, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_max_u16_e32 v0, v6, v7 ; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -433,46 +436,46 @@ ; VI-LABEL: v_test_umax_ugt_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v3, v5, v2 -; VI-NEXT: v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_max_u16_e32 v2, v5, v6 +; VI-NEXT: v_max_u16_sdwa v3, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_ugt_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_max_u16 v0, v6, v7 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll @@ -1,7 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10CU %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode,-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10CU %s ; GCN-LABEL: {{^}}system_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -85,10 +85,10 @@ ; SI: s_min_i32 ; SI: s_min_i32 -; VI: s_min_i32 ; VI: s_min_i32 ; VI: s_min_i32 ; VI: v_min_i32_sdwa +; VI: s_min_i32 ; GFX9_10: v_min_i16 ; GFX9_10: v_min_i16 @@ -495,8 +495,8 @@ ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: ; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} ; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} -; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; GCN: s_min_u32 [[MIN:s[0-9]+]], s{{[0-9]}}, s{{[0-9]}} +; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], s{{[0-9]}} ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_UINT diff --git a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll --- a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll +++ b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}sample_contig_nsa: ; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], diff --git a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir --- a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: hazard_image_sample_d_buf_off6 # GCN: IMAGE_SAMPLE diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -648,6 +648,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -662,6 +663,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -679,6 +681,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -693,6 +696,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -710,6 +714,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -724,6 +729,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -743,6 +749,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -757,6 +764,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -776,6 +784,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -790,6 +799,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -809,6 +819,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -823,6 +834,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -842,6 +854,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -856,6 +869,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -873,6 +887,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -887,6 +902,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -906,6 +922,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -920,6 +937,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -939,6 +957,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -953,6 +972,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -972,6 +992,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -986,6 +1007,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1005,6 +1027,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1019,6 +1042,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1038,6 +1062,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1052,6 +1077,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1071,6 +1097,7 @@ ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1085,6 +1112,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1104,6 +1132,7 @@ ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1118,6 +1147,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1137,6 +1167,7 @@ ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1151,6 +1182,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1171,6 +1203,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1185,6 +1218,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1205,6 +1239,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1219,6 +1254,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1239,6 +1275,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1253,6 +1290,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1274,6 +1312,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1288,6 +1327,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1309,6 +1349,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1323,6 +1364,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1344,6 +1386,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1358,6 +1401,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1379,6 +1423,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1393,6 +1438,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1414,6 +1460,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1428,6 +1475,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm @@ -1449,6 +1497,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_byte v[0:1], v0 ; GFX9-NEXT: s_endpgm @@ -1463,6 +1512,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -636,6 +636,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -648,6 +649,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -665,6 +667,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -677,6 +680,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -694,6 +698,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -706,6 +711,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -725,6 +731,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -737,6 +744,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -754,6 +762,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -766,6 +775,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -783,6 +793,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -795,6 +806,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xfffff000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -814,6 +826,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -826,6 +839,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffe000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -843,6 +857,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -855,6 +870,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -874,6 +890,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -886,6 +903,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -905,6 +923,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -917,6 +936,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x3800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -934,6 +954,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -946,6 +967,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xfffff000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -965,6 +987,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -977,6 +1000,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffe000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -996,6 +1020,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1008,6 +1033,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffc000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1027,6 +1053,7 @@ ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1039,6 +1066,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1058,6 +1086,7 @@ ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1070,6 +1099,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1089,6 +1119,7 @@ ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1101,6 +1132,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1121,6 +1153,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1133,6 +1166,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1153,6 +1187,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1165,6 +1200,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1185,6 +1221,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1197,6 +1234,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x2000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1218,6 +1256,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1231,6 +1270,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1252,6 +1292,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1265,6 +1306,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1286,6 +1328,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1299,6 +1342,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1320,6 +1364,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1333,6 +1378,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1354,6 +1400,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1367,6 +1414,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1388,6 +1436,7 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1401,6 +1450,7 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_nop 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -94,9 +94,9 @@ ; FUNC-LABEL: {{^}}scalar_or_inline_imm_i64: ; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-NOT: or_b32 -; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63 +; SI: s_or_b32 s[[OR:[0-9]+]], s[[VAL_LO]], 63 ; SI-NOT: or_b32 -; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]] +; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[OR]] ; SI-NOT: or_b32 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] ; SI-NOT: or_b32 @@ -121,9 +121,9 @@ ; FUNC-LABEL: {{^}}scalar_or_neg_inline_imm_i64: ; SI-DAG: s_load_dword [[VAL:s[0-9]+]] -; SI-DAG: s_or_b32 [[VAL]], [[VAL]], -8 +; SI-DAG: s_or_b32 [[OR:s[0-9]+]], [[VAL]], -8 ; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}} -; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]] +; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[OR]] ; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = or i64 %a, -8 diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -56,12 +56,12 @@ ; GCN: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: } ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { - ; GCN: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + ; GCN: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) ; GCN: } ; GCN: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { - ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) ; GCN: } ; GCN: S_NOP 0 ; GCN: $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 464, 0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll --- a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll @@ -47,10 +47,10 @@ ; GCN-LABEL: reassoc_v2i32: ; GCN: s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_add_i32 [[ADD2:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} -; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}} +; GCN-DAG: s_add_i32 [[ADD2:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} +; GFX8-DAG: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}} ; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD2]], v{{[0-9]+}} -; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}} +; GFX9-DAG: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}} ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD2]], v{{[0-9]+}} define amdgpu_kernel void @reassoc_v2i32(<2 x i32> addrspace(1)* %arg, <2 x i32> %x, <2 x i32> %y) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll --- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; TODO: Some of those tests fail with OS == amdhsa due to unreasonable register ; allocation differences. diff --git a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll --- a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; SI-LABEL: {{^}}s_mulk_i32_k0: ; SI: s_load_dword [[VAL:s[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -37,18 +37,18 @@ ; VI-LABEL: saddo_i64_zext: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: s_add_u32 s2, s6, s0 -; VI-NEXT: s_addc_u32 s3, s7, s1 +; VI-NEXT: s_add_u32 s0, s6, s2 +; VI-NEXT: s_addc_u32 s1, s7, s3 ; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] -; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[1:2] +; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -58,18 +58,18 @@ ; GFX9-LABEL: saddo_i64_zext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: s_add_u32 s2, s6, s0 -; GFX9-NEXT: s_addc_u32 s3, s7, s1 +; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[1:2] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc @@ -113,16 +113,16 @@ ; VI-LABEL: s_saddo_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0 -; VI-NEXT: s_add_i32 s1, s0, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4 -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], s3, 0 +; VI-NEXT: s_add_i32 s3, s2, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, s3, v4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -133,12 +133,12 @@ ; GFX9-LABEL: s_saddo_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_i32 v4, s0, v4 clamp -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: s_add_i32 s0, s2, s3 +; GFX9-NEXT: v_add_i32 v4, s2, v4 clamp ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v5, s0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v4 @@ -362,21 +362,21 @@ ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v6, s2 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc -; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1] -; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] +; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v6 +; VI-NEXT: v_addc_u32_e32 v9, vcc, v5, v7, vcc +; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[4:5] +; VI-NEXT: flat_store_dwordx2 v[0:1], v[8:9] ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; VI-NEXT: flat_store_byte v[6:7], v0 +; VI-NEXT: flat_store_byte v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_saddo_i64: @@ -387,21 +387,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[8:9], off +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[4:5] +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: global_store_byte v[6:7], v0, off +; GFX9-NEXT: global_store_byte v[2:3], v0, off ; GFX9-NEXT: s_endpgm %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 @@ -457,25 +457,25 @@ ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v6, s2 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3 -; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2 -; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v5, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v6 +; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v7 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v5 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6 +; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v4 ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[8:9] ; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] -; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_saddo_v2i32: @@ -486,23 +486,23 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_i32 v8, v0, v2 clamp -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_add_i32 v2, v1, v3 clamp -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_add_i32 v8, v4, v6 clamp +; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 +; GFX9-NEXT: v_add_i32 v6, v5, v7 clamp +; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v6 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -235,11 +235,11 @@ ; VI-LABEL: scalar_to_vector_test6: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -489,8 +489,8 @@ ; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 -; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v3, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(3) @@ -499,9 +499,9 @@ ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll --- a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013 define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) { @@ -14,6 +15,7 @@ } ; GCN-LABEL: const_load_no_shrink_dword_to_aligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003 define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) { @@ -27,6 +29,7 @@ } ; GCN-LABEL: global_load_no_shrink_dword_to_unaligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013 define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %x) { @@ -40,6 +43,7 @@ } ; GCN-LABEL: global_load_no_shrink_dword_to_aligned_byte: +; GCN: s_load_dword s{{[0-9]+}} ; GCN: s_load_dword [[LD:s[0-9]+]], ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003 define amdgpu_kernel void @global_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) { diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,12 +8,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -21,20 +21,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x30 -; VI-NEXT: s_mov_b32 s3, 0xffff +; VI-NEXT: s_load_dword s3, s[0:1], 0x30 +; VI-NEXT: s_mov_b32 s1, 0xffff ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_lshr_b32 s8, s0, 16 -; VI-NEXT: s_and_b32 s2, s2, s3 -; VI-NEXT: s_and_b32 s0, s0, s3 -; VI-NEXT: s_lshl_b32 s0, s2, s0 -; VI-NEXT: s_lshl_b32 s1, s1, s8 -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_and_b32 s0, s0, s3 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_lshr_b32 s8, s3, 16 +; VI-NEXT: s_and_b32 s2, s2, s1 +; VI-NEXT: s_and_b32 s3, s3, s1 +; VI-NEXT: s_lshl_b32 s0, s0, s8 +; VI-NEXT: s_lshl_b32 s2, s2, s3 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_and_b32 s1, s2, s1 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -74,12 +74,12 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v5, v4 ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; @@ -94,14 +94,14 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v6, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5 -; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_lshlrev_b16_e32 v2, v6, v5 +; VI-NEXT: v_lshlrev_b16_sdwa v3, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -146,7 +146,7 @@ ; GFX9-LABEL: shl_v_s_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -157,27 +157,27 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s2, v0 ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_s_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: s_lshr_b32 s0, s2, 16 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v4, s0, v3 +; VI-NEXT: v_lshlrev_b16_e32 v4, s2, v3 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -221,7 +221,7 @@ ; GFX9-LABEL: shl_s_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -232,27 +232,27 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s2 ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_s_v_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: s_lshr_b32 s0, s2, 16 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s0 +; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s2 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -429,45 +429,45 @@ ; GFX9-LABEL: v_shl_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_pk_lshlrev_b16 v3, v5, v3 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, v4, v2 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1 -; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0 -; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v1, v6, v1 -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: v_lshlrev_b16_e32 v2, v7, v5 +; VI-NEXT: v_lshlrev_b16_sdwa v3, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v5, v6, v4 +; VI-NEXT: v_lshlrev_b16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v3, v2, v3 +; VI-NEXT: v_or_b32_e32 v2, v5, v4 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_shl_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -30,13 +30,13 @@ ; FLAT-LABEL: break_inserted_outside_of_loop: ; FLAT: ; %bb.0: ; %main_body ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dword s0, s[0:1], 0x2c +; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; FLAT-NEXT: s_mov_b64 s[0:1], 0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_and_b32_e32 v0, s0, v0 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v0 ; FLAT-NEXT: v_and_b32_e32 v0, 1, v0 ; FLAT-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; FLAT-NEXT: s_mov_b64 s[0:1], 0 ; FLAT-NEXT: BB0_1: ; %ENDIF ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_and_b64 s[2:3], exec, vcc diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir --- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir +++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir @@ -10,7 +10,7 @@ bb.0: ; GCN-LABEL: name: si-lower-control-flow ; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 16, 0 + ; GCN: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 16, 0, 0 ; GCN: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0 = S_AND_B32 [[S_LOAD_DWORD_IMM]], 255, implicit-def $scc ; GCN: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0 = S_AND_B32 65535, [[S_AND_B32_]], implicit-def $scc ; GCN: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -19,12 +19,12 @@ ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -52,17 +52,17 @@ ; ; VI-LABEL: test_s_sext_i32_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: s_add_i32 s0, s0, s2 +; VI-NEXT: s_mul_i32 s0, s4, s5 +; VI-NEXT: s_add_i32 s0, s0, s6 ; VI-NEXT: s_ashr_i32 s1, s0, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm entry: %mul = mul i32 %a, %b @@ -90,12 +90,12 @@ ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -123,13 +123,13 @@ ; VI-LABEL: s_sext_i32_to_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s1, s0, 31 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_ashr_i32 s0, s2, 31 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %sext = sext i32 %a to i64 @@ -196,11 +196,11 @@ ; VI-LABEL: s_sext_i16_to_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 +; VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -227,12 +227,12 @@ ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -266,14 +266,14 @@ ; VI-LABEL: s_sext_i1_to_i16_with_and: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v0 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s10, v1 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -304,17 +304,17 @@ ; ; VI-LABEL: v_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %cmp0 = icmp eq i32 %a, %tid @@ -361,22 +361,22 @@ ; VI-LABEL: s_sext_v4i8_to_v4i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s0 -; VI-NEXT: s_ashr_i32 s1, s0, 24 -; VI-NEXT: s_bfe_i32 s2, s0, 0x80010 -; VI-NEXT: s_sext_i32_i8 s0, s0 +; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; VI-NEXT: s_ashr_i32 s0, s2, 24 +; VI-NEXT: s_bfe_i32 s1, s2, 0x80010 +; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 8 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %cast = bitcast i32 %a to <4 x i8> %ext = sext <4 x i8> %cast to <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll --- a/llvm/test/CodeGen/AMDGPU/smed3.ll +++ b/llvm/test/CodeGen/AMDGPU/smed3.ll @@ -637,8 +637,8 @@ ; SI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; FIXME: VI not matching med3 -; VI: v_min_i16 -; VI: v_max_i16 +; VI-DAG: v_min_i16 +; VI-DAG: v_max_i16 ; VI: v_min_i16 ; VI: v_max_i16 diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -150,8 +150,8 @@ ; GCN-LABEL: {{^}}smrd_load_const1: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff glc ; encoding: [0xff -; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc ; -; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc glc ; +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc ; +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc glc ; define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -173,8 +173,8 @@ ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400 +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400 define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -194,8 +194,8 @@ ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc +; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 @@ -211,8 +211,8 @@ ; SMRD load with an offset greater than the largest possible immediate on VI ; GCN-LABEL: {{^}}smrd_load_const4: ; SIVIGFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 -; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] -; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]:[0-9]+}}], [[OFFSET]] +; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]:[0-9]+}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir --- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GFX9 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GFX10 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX10 %s --- | define amdgpu_kernel void @check_vcc() #0 { @@ -36,9 +36,9 @@ bb.0: liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7 - ; CHECK-LABEL: name: check_vcc - ; CHECK: liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr9 + ; GFX9-LABEL: name: check_vcc + ; GFX9: liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr9 ; GFX9: $sgpr33 = S_MOV_B32 0 ; GFX9: $sgpr12 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; GFX9: $sgpr13 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 @@ -68,7 +68,8 @@ ; GFX9: $exec = S_MOV_B64 killed $vcc ; GFX9: $vcc_lo = V_READLANE_B32_vi $vgpr0, 0, implicit-def $vcc ; GFX9: $vcc_hi = V_READLANE_B32_vi killed $vgpr0, 1 - + ; GFX10-LABEL: name: check_vcc + ; GFX10: liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr9 ; GFX10: $sgpr33 = S_MOV_B32 0 ; GFX10: $sgpr96 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99 ; GFX10: $sgpr97 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -137,7 +137,7 @@ ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; GCN: NumVgprs: 10 -; GFX900: ScratchSize: 44 +; GFX900: ScratchSize: 52 ; GFX908: ScratchSize: 20 ; GCN: VGPRBlocks: 2 ; GCN: NumVGPRsForWavesPerEU: 10 @@ -246,7 +246,7 @@ ; GFX908-DAG: v_accvgpr_read_b32 ; GCN: NumVgprs: 256 -; GFX900: ScratchSize: 2052 +; GFX900: ScratchSize: 1028 ; GFX908-FIXME: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256 diff --git a/llvm/test/CodeGen/AMDGPU/spill192.mir b/llvm/test/CodeGen/AMDGPU/spill192.mir --- a/llvm/test/CodeGen/AMDGPU/spill192.mir +++ b/llvm/test/CodeGen/AMDGPU/spill192.mir @@ -30,7 +30,7 @@ ; EXPANDED: successors: %bb.1(0x80000000) ; EXPANDED: liveins: $vgpr0 ; EXPANDED: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED: $vgpr0 = V_WRITELANE_B32_gfx6_gfx7 killed $sgpr4, 0, undef $vgpr0 + ; EXPANDED: $vgpr0 = V_WRITELANE_B32_gfx6_gfx7 killed $sgpr4, 0, undef $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; EXPANDED: $vgpr0 = V_WRITELANE_B32_gfx6_gfx7 killed $sgpr5, 1, $vgpr0 ; EXPANDED: $vgpr0 = V_WRITELANE_B32_gfx6_gfx7 killed $sgpr6, 2, $vgpr0 ; EXPANDED: $vgpr0 = V_WRITELANE_B32_gfx6_gfx7 killed $sgpr7, 3, $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -6,14 +6,14 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { ; CIVI-LABEL: local_store_i56: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: ds_write_b32 v0, v1 -; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 -; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b32 v0, v1 +; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i56: ; GFX9: ; %bb.0: @@ -30,70 +30,70 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_or_b32 s0, s4, 14 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s5 -; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v1, s0 -; HAWAII-NEXT: v_mov_b32_e32 v2, s1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s2 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 -; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 -; HAWAII-NEXT: ds_write_b32 v1, v2 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_or_b32 s0, s4, 14 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s5 +; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v1, s0 +; HAWAII-NEXT: v_mov_b32_e32 v2, s1 +; HAWAII-NEXT: v_mov_b32_e32 v3, s2 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 +; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 +; HAWAII-NEXT: ds_write_b32 v1, v2 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_or_b32 s0, s4, 14 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s5 -; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v1, s0 -; FIJI-NEXT: v_mov_b32_e32 v3, s1 -; FIJI-NEXT: s_and_b32 s3, s2, 0xffff -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 -; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 -; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 -; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 -; FIJI-NEXT: ds_write_b32 v1, v3 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_or_b32 s0, s4, 14 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s5 +; FIJI-NEXT: flat_load_ubyte v0, v[0:1] +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v1, s0 +; FIJI-NEXT: v_mov_b32_e32 v3, s1 +; FIJI-NEXT: s_and_b32 s3, s2, 0xffff +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 +; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 +; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 +; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 +; FIJI-NEXT: ds_write_b32 v1, v3 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_and_b32 s3, s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v2, s3, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 -; GFX9-NEXT: ds_write_b32 v0, v3 -; GFX9-NEXT: s_endpgm +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_and_b32 s3, s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v2, s3, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void } @@ -101,31 +101,31 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v2, s1 -; HAWAII-NEXT: v_mov_b32_e32 v1, s2 -; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 -; HAWAII-NEXT: ds_write_b32 v0, v2 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v2, s1 +; HAWAII-NEXT: v_mov_b32_e32 v1, s2 +; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 +; HAWAII-NEXT: ds_write_b32 v0, v2 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v2, s1 -; FIJI-NEXT: v_mov_b32_e32 v1, s2 -; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 -; FIJI-NEXT: ds_write_b32 v0, v2 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v2, s1 +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 +; FIJI-NEXT: ds_write_b32 v0, v2 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: @@ -146,35 +146,35 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: s_and_b32 s3, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s3 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 -; HAWAII-NEXT: ds_write_b64 v2, v[0:1] -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: s_and_b32 s3, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v3, s3 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 +; HAWAII-NEXT: ds_write_b64 v2, v[0:1] +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: s_and_b32 s3, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v3, s3 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 -; FIJI-NEXT: ds_write_b64 v2, v[0:1] -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: s_and_b32 s3, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v3, s3 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 +; FIJI-NEXT: ds_write_b64 v2, v[0:1] +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: @@ -218,22 +218,22 @@ define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 { ; CIVI-LABEL: local_store_i17: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; CIVI-NEXT: ds_write_b16 v0, v1 -; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; CIVI-NEXT: ds_write_b16 v0, v1 +; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i17: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x1ffff, v1 -; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 0x1ffff, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i17 %arg, i17 addrspace(3)* %ptr, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -18,35 +18,35 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v0, v4, v5 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_e32 v2, v0, v1 -; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_sub_u16_e32 v0, v4, v5 +; VI-NEXT: v_sub_u16_sdwa v1, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -87,13 +87,13 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: s_load_dword s6, s[8:9], 0x0 +; VI-NEXT: s_load_dword s10, s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s4, 16 -; VI-NEXT: s_lshr_b32 s7, s6, 16 -; VI-NEXT: s_sub_i32 s4, s4, s6 -; VI-NEXT: s_sub_i32 s5, s5, s7 +; VI-NEXT: s_lshr_b32 s6, s10, 16 +; VI-NEXT: s_sub_i32 s4, s4, s10 +; VI-NEXT: s_sub_i32 s5, s5, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -129,11 +129,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -142,17 +142,17 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x30 +; VI-NEXT: s_load_dword s3, s[0:1], 0x30 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_lshr_b32 s3, s0, 16 -; VI-NEXT: s_sub_i32 s1, s1, s3 -; VI-NEXT: s_sub_i32 s0, s2, s0 -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: s_sub_i32 s0, s0, s1 +; VI-NEXT: s_sub_i32 s1, s2, s3 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -401,21 +401,21 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v0, v4, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -424,22 +424,22 @@ ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_e32 v0, v1, v2 -; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_u16_e32 v0, v4, v5 +; VI-NEXT: v_sub_u16_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -459,22 +459,22 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v1, v4, v5 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -484,24 +484,24 @@ ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v5, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_e32 v0, v4, v2 -; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_u16_e32 v0, v4, v5 +; VI-NEXT: v_sub_u16_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -521,21 +521,21 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v0, v4, v5 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -544,24 +544,24 @@ ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_sub_u16_e32 v0, v0, v1 +; VI-NEXT: v_sub_u16_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_u16_e32 v0, v4, v5 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NEXT: v_bfe_i32 v1, v2, 0, 16 +; VI-NEXT: v_bfe_i32 v1, v1, 0, 16 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -581,21 +581,21 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v1, v4, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 @@ -607,24 +607,24 @@ ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_sub_u16_e32 v0, v0, v1 +; VI-NEXT: v_sub_u16_e32 v0, v4, v5 +; VI-NEXT: v_sub_u16_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_bfe_i32 v2, v1, 0, 16 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -104,15 +104,16 @@ ; VI-LABEL: truncate_high_elt_extract_vector: ; VI: ; %bb.0: ; %bb ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_nop 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[4:5], 0x0 -; VI-NEXT: s_load_dword s3, s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s1, s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s0, s2 -; VI-NEXT: s_sext_i32_i16 s1, s3 +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_sext_i32_i16 s1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mul_i32_i24_e32 v2, s1, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll --- a/llvm/test/CodeGen/AMDGPU/umed3.ll +++ b/llvm/test/CodeGen/AMDGPU/umed3.ll @@ -672,8 +672,8 @@ ; SI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; FIXME: VI not matching med3 -; VI: v_min_u16 -; VI: v_max_u16 +; VI-DAG: v_min_u16 +; VI-DAG: v_max_u16 ; VI: v_min_u16 ; VI: v_max_u16 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -18,11 +18,12 @@ ; GFX9-LABEL: shuffle_v4f16_234u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -74,11 +75,12 @@ ; GFX9-LABEL: shuffle_v4f16_3u6u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -90,11 +92,12 @@ ; GFX9-LABEL: shuffle_v4f16_3uu7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -106,15 +109,15 @@ ; GFX9-LABEL: shuffle_v4f16_35u5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -126,14 +129,14 @@ ; GFX9-LABEL: shuffle_v4f16_357u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -173,9 +176,12 @@ ; GFX9-LABEL: shuffle_v4f16_0145: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -187,9 +193,12 @@ ; GFX9-LABEL: shuffle_v4f16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -229,9 +238,12 @@ ; GFX9-LABEL: shuffle_v4f16_2345: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -243,9 +255,12 @@ ; GFX9-LABEL: shuffle_v4f16_2367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -257,11 +272,12 @@ ; GFX9-LABEL: shuffle_v4f16_4501: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -273,11 +289,12 @@ ; GFX9-LABEL: shuffle_v4f16_4523: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -316,11 +333,12 @@ ; GFX9-LABEL: shuffle_v4f16_6701: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -332,11 +350,12 @@ ; GFX9-LABEL: shuffle_v4f16_6723: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -376,13 +395,14 @@ ; GFX9-LABEL: shuffle_v4f16_2356: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -394,13 +414,14 @@ ; GFX9-LABEL: shuffle_v4f16_5623: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -412,14 +433,15 @@ ; GFX9-LABEL: shuffle_v4f16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -431,15 +453,14 @@ ; GFX9-LABEL: shuffle_v4f16_5634: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -451,16 +472,16 @@ ; GFX9-LABEL: shuffle_v4f16_5734: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -472,13 +493,14 @@ ; GFX9-LABEL: shuffle_v4i16_2356: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -490,9 +512,12 @@ ; GFX9-LABEL: shuffle_v4i16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -556,12 +581,12 @@ ; GFX9-LABEL: shuffle_v4f16_6161: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -634,9 +659,12 @@ ; GFX9-LABEL: shuffle_v8f16_4589: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:8 -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -648,11 +676,12 @@ ; GFX9-LABEL: shuffle_v8f16_10_11_2_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -664,13 +693,14 @@ ; GFX9-LABEL: shuffle_v8f16_13_14_2_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 -; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -713,14 +743,16 @@ ; GFX9-LABEL: shuffle_v6f16_452367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: global_load_dword v3, v[3:4], off +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off +; GFX9-NEXT: global_load_dword v7, v[3:4], off ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0 %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1 @@ -732,7 +764,7 @@ ; GFX9-LABEL: fma_shuffle: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -741,11 +773,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v6, v0, v2, v6 op_sel_hi:[0,1,1] @@ -787,15 +821,15 @@ ; GFX9-LABEL: shuffle_v4f16_0456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v7, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -10,21 +10,21 @@ ; GCN-NEXT: BB0_1: ; %bb0 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 -; GCN-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 +; GCN-NEXT: v_add_co_u32_e64 v6, vcc_lo, v0, 8 ; GCN-NEXT: s_mov_b32 s5, exec_lo -; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] +; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s8, v4 -; GCN-NEXT: v_readfirstlane_b32 s9, v5 -; GCN-NEXT: v_readfirstlane_b32 s10, v2 -; GCN-NEXT: v_readfirstlane_b32 s11, v3 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5] -; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GCN-NEXT: v_readfirstlane_b32 s8, v2 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3] +; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 ; GCN-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -109,6 +109,7 @@ ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir --- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -31,7 +31,7 @@ ; CHECK-LABEL: name: undef_identity_copy ; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1) ; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc - ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95, implicit-def $scc ; CHECK: $sgpr4 = COPY $sgpr95 ; CHECK: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @foo, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4 ; CHECK: ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9,GFX9_10 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10,GFX9_10 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10 %s --- | define amdgpu_kernel void @max-counter-lgkmcnt() #0 { ret void } @@ -21,25 +21,60 @@ ; GFX9-LABEL: name: max-counter-lgkmcnt ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec + ; GFX9: $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec + ; GFX9: $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec + ; GFX9: $vgpr6_vgpr7 = DS_READ2_B32_gfx9 renamable $vgpr99, 6, 7, 0, implicit $exec + ; GFX9: $vgpr8_vgpr9 = DS_READ2_B32_gfx9 renamable $vgpr99, 8, 9, 0, implicit $exec + ; GFX9: $vgpr10_vgpr11 = DS_READ2_B32_gfx9 renamable $vgpr99, 10, 11, 0, implicit $exec + ; GFX9: $vgpr12_vgpr13 = DS_READ2_B32_gfx9 renamable $vgpr99, 12, 13, 0, implicit $exec + ; GFX9: $vgpr14_vgpr15 = DS_READ2_B32_gfx9 renamable $vgpr99, 14, 15, 0, implicit $exec + ; GFX9: $vgpr16_vgpr17 = DS_READ2_B32_gfx9 renamable $vgpr99, 16, 17, 0, implicit $exec + ; GFX9: $vgpr18_vgpr19 = DS_READ2_B32_gfx9 renamable $vgpr99, 18, 19, 0, implicit $exec + ; GFX9: $vgpr20_vgpr21 = DS_READ2_B32_gfx9 renamable $vgpr99, 20, 21, 0, implicit $exec + ; GFX9: $vgpr22_vgpr23 = DS_READ2_B32_gfx9 renamable $vgpr99, 22, 23, 0, implicit $exec + ; GFX9: $vgpr24_vgpr25 = DS_READ2_B32_gfx9 renamable $vgpr99, 24, 25, 0, implicit $exec + ; GFX9: $vgpr26_vgpr27 = DS_READ2_B32_gfx9 renamable $vgpr99, 26, 27, 0, implicit $exec + ; GFX9: $vgpr28_vgpr29 = DS_READ2_B32_gfx9 renamable $vgpr99, 28, 29, 0, implicit $exec + ; GFX9: $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec + ; GFX9: $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec ; GFX9: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec - ; GFX9-NOT: S_WAITCNT 53119 - ; GFX9-NEXT: S_WAITCNT 52863 - ; GFX9-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; GFX9-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec - ; GFX9-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec - ; GFX9-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec - ; GFX9-NEXT: S_ENDPGM 0 + ; GFX9: S_WAITCNT 52863 + ; GFX9: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX9: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX9: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec + ; GFX9: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec + ; GFX9: S_ENDPGM 0 ; GFX10-LABEL: name: max-counter-lgkmcnt + ; GFX10: S_WAITCNT 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec + ; GFX10: $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec + ; GFX10: $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec + ; GFX10: $vgpr6_vgpr7 = DS_READ2_B32_gfx9 renamable $vgpr99, 6, 7, 0, implicit $exec + ; GFX10: $vgpr8_vgpr9 = DS_READ2_B32_gfx9 renamable $vgpr99, 8, 9, 0, implicit $exec + ; GFX10: $vgpr10_vgpr11 = DS_READ2_B32_gfx9 renamable $vgpr99, 10, 11, 0, implicit $exec + ; GFX10: $vgpr12_vgpr13 = DS_READ2_B32_gfx9 renamable $vgpr99, 12, 13, 0, implicit $exec + ; GFX10: $vgpr14_vgpr15 = DS_READ2_B32_gfx9 renamable $vgpr99, 14, 15, 0, implicit $exec + ; GFX10: $vgpr16_vgpr17 = DS_READ2_B32_gfx9 renamable $vgpr99, 16, 17, 0, implicit $exec + ; GFX10: $vgpr18_vgpr19 = DS_READ2_B32_gfx9 renamable $vgpr99, 18, 19, 0, implicit $exec + ; GFX10: $vgpr20_vgpr21 = DS_READ2_B32_gfx9 renamable $vgpr99, 20, 21, 0, implicit $exec + ; GFX10: $vgpr22_vgpr23 = DS_READ2_B32_gfx9 renamable $vgpr99, 22, 23, 0, implicit $exec + ; GFX10: $vgpr24_vgpr25 = DS_READ2_B32_gfx9 renamable $vgpr99, 24, 25, 0, implicit $exec + ; GFX10: $vgpr26_vgpr27 = DS_READ2_B32_gfx9 renamable $vgpr99, 26, 27, 0, implicit $exec + ; GFX10: $vgpr28_vgpr29 = DS_READ2_B32_gfx9 renamable $vgpr99, 28, 29, 0, implicit $exec + ; GFX10: $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec + ; GFX10: $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec ; GFX10: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec - ; GFX10-NEXT: S_WAITCNT 53631 - ; GFX10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; GFX10-NEXT: S_WAITCNT 53375 - ; GFX10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec - ; GFX10-NEXT: S_WAITCNT 53119 - ; GFX10-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec - ; GFX10-NEXT: S_WAITCNT 52863 - ; GFX10-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec - ; GFX10-NEXT: S_ENDPGM 0 + ; GFX10: S_WAITCNT 53631 + ; GFX10: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX10: S_WAITCNT 53375 + ; GFX10: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX10: S_WAITCNT 53119 + ; GFX10: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec + ; GFX10: S_WAITCNT 52863 + ; GFX10: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec + ; GFX10: S_ENDPGM 0 $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec @@ -72,17 +107,157 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 - ; GFX9_10-LABEL: name: max-counter-vmcnt - ; GFX9_10: $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, implicit $exec - ; GFX9-NOT: S_WAITCNT 53119 - ; GFX10-NOT: S_WAITCNT 65407 - ; GFX9-NEXT: S_WAITCNT 53118 - ; GFX10-NEXT: S_WAITCNT 65406 - ; GFX9_10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; GFX9_10-NEXT: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec - ; GFX9_10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec - ; GFX9_10-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec - ; GFX9_10-NEXT: S_ENDPGM 0 + ; GFX9-LABEL: name: max-counter-vmcnt + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 92, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr49 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_WAITCNT 53118 + ; GFX9: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX9: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec + ; GFX9: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX9: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec + ; GFX9: S_ENDPGM 0 + ; GFX10-LABEL: name: max-counter-vmcnt + ; GFX10: S_WAITCNT 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 92, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr49 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: S_WAITCNT 65406 + ; GFX10: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX10: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec + ; GFX10: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX10: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec + ; GFX10: S_ENDPGM 0 $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec @@ -164,9 +339,31 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX9_10-LABEL: name: max-counter-expcnt - ; GFX9_10: EXP - ; GFX9_10-NOT: S_WAITCNT + ; GFX9-LABEL: name: max-counter-expcnt + ; GFX9: S_WAITCNT 0 + ; GFX9: EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX9: S_ENDPGM 0 + ; GFX10-LABEL: name: max-counter-expcnt + ; GFX10: S_WAITCNT 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX10: S_ENDPGM 0 EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir @@ -44,8 +44,8 @@ ; GFX9-LABEL: name: gather_gather ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; GFX9: S_WAITCNT 0 - ; GFX9: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GFX9: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) + ; GFX9: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) ... @@ -62,9 +62,9 @@ ; GFX9-LABEL: name: nosampler_sampler ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9: S_WAITCNT 0 - ; GFX9: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) ; GFX9: S_WAITCNT 3952 - ; GFX9: $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec + ; GFX9: $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 16) $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 16) ... diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -95,7 +95,7 @@ ; GFX9-LABEL: {{^}}call: define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) { ; GFX9-O0: v_mov_b32_e32 v0, s0 -; GFX9-O3: v_mov_b32_e32 v2, s0 +; GFX9-O3: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 @@ -129,8 +129,8 @@ define amdgpu_kernel void @call_i64(<4 x i32> inreg %tmp14, i64 inreg %arg) { ; GFX9-O0: v_mov_b32_e32 v0, s0 ; GFX9-O0: v_mov_b32_e32 v1, s1 -; GFX9-O3: v_mov_b32_e32 v7, s1 -; GFX9-O3: v_mov_b32_e32 v6, s0 +; GFX9-O3: v_mov_b32_e32 v7, s3 +; GFX9-O3: v_mov_b32_e32 v6, s2 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll --- a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll @@ -2,18 +2,13 @@ ; RUN: llc -march=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s ; RUN: llc -march=amdgcn -mcpu=gfx802 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s -; RUN: llc -march=amdgcn -mcpu=gfx902 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx902 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s ; REQUIRES: asserts -; Some subtargets have a default setting of 'On' instead of 'Any' to maintain -; backwards compatibility. This is a temporary measure until the new TargetID is -; implemented. - ; NOT-SUPPORTED: XNACK setting for subtarget: Not Supported ; ANY: XNACK setting for subtarget: Any -; ON: XNACK setting for subtarget: On define void @xnack-subtarget-feature-any() #0 { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -204,7 +204,7 @@ ; FUNC-LABEL: {{^}}scalar_xor_inline_imm_i64: ; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-NOT: xor_b32 -; SI: s_xor_b32 s[[VAL_LO]], s{{[0-9]+}}, 63 +; SI: s_xor_b32 s{{[0-9]}}, s{{[0-9]+}}, 63 ; SI-NOT: xor_b32 ; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s{{[0-9]+}} ; SI-NOT: xor_b32 diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -54,7 +54,7 @@ ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], [[MASK]] ; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], [[MASK]] -; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] +; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], s{{[0-9]}} ; GCN: v_cmp_eq_u32_e32 vcc, [[MASK_A]], [[V_B]] ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc diff --git a/llvm/test/MC/AMDGPU/xnack-mask.s b/llvm/test/MC/AMDGPU/xnack-mask.s --- a/llvm/test/MC/AMDGPU/xnack-mask.s +++ b/llvm/test/MC/AMDGPU/xnack-mask.s @@ -1,10 +1,12 @@ // RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti -show-encoding %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 %s // RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii -show-encoding %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -mattr=-xnack -show-encoding %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 %s // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1001 -show-encoding %s 2>&1 | FileCheck -check-prefix=NOSICIVI10 %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -show-encoding %s 2>&1 | FileCheck -check-prefix=XNACKERR %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -show-encoding %s | FileCheck -check-prefix=XNACK %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -mattr=+xnack -show-encoding %s 2>&1 | FileCheck -check-prefix=XNACKERR %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=stoney -mattr=+xnack -show-encoding %s | FileCheck -check-prefix=XNACK %s + +; FIXME: Incorrect diagnostics after changing the defaults for xnack and sramecc. s_mov_b64 xnack_mask, -1 // NOSICIVI10: error: not a valid operand.