Index: llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp =================================================================== --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -70,8 +70,16 @@ TargetOccupancy = MFI.getOccupancy(); SGPRCriticalLimit = std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit); - VGPRCriticalLimit = - std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit); + + // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except returns + // a reasonably small number for targets with lots of VGPRs, such as GFX10 and + // GFX11. + unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST); + unsigned VGPRBudget = + alignDown(AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST) / TargetOccupancy, + Granule); + VGPRBudget = std::max(VGPRBudget, Granule); + VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit); // Subtract error margin from register limits and avoid overflow. SGPRCriticalLimit = Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -505,8 +505,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_ashrrev_i32_e32 v0, v16, v0 +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, v17, v1 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, v18, v2 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, v19, v3 @@ -522,15 +522,15 @@ ; GFX10-NEXT: v_ashrrev_i32_e32 v13, v29, v13 ; GFX10-NEXT: v_ashrrev_i32_e32 v14, v30, v14 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_ashrrev_i32_e32 v15, v31, v15 +; GFX10-NEXT: v_ashrrev_i32_e32 v15, v16, v15 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ashr_v16i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: v_ashrrev_i32_e32 v0, v16, v0 +; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, v17, v1 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, v18, v2 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, v19, v3 @@ -546,7 +546,7 @@ ; GFX11-NEXT: v_ashrrev_i32_e32 v13, v29, v13 ; GFX11-NEXT: v_ashrrev_i32_e32 v14, v30, v14 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_ashrrev_i32_e32 v15, v31, v15 +; GFX11-NEXT: v_ashrrev_i32_e32 v15, v16, v15 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = ashr <16 x i32> %value, %amount ret <16 x i32> %result @@ -1762,22 +1762,22 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_bfe_i32 v4, v2, 0, 1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v3 -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v3, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 -; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v3 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v2, v[4:5] -; GFX10-NEXT: v_ashrrev_i64 v[10:11], v10, v[4:5] -; GFX10-NEXT: v_or_b32_e32 v2, v6, v8 -; GFX10-NEXT: v_or_b32_e32 v8, v7, v9 -; GFX10-NEXT: v_ashrrev_i64 v[6:7], v3, v[4:5] -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v8, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], v2, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v2, v8, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 64, v3 +; GFX10-NEXT: v_or_b32_e32 v8, v9, v7 +; GFX10-NEXT: v_ashrrev_i64 v[6:7], v6, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v1, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc_lo +; GFX10-NEXT: v_ashrrev_i64 v[2:3], v3, v[4:5] +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v6, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ashr_i65: @@ -1844,18 +1844,31 @@ ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_ashr_i65_33: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v1 -; GFX10PLUS-NEXT: v_bfe_i32 v1, v2, 0, 1 -; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2 -; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_ashr_i65_33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_bfe_i32 v0, v2, 0, 1 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 1, v1 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] +; GFX10-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_ashr_i65_33: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: v_bfe_i32 v1, v2, 0, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 1, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = ashr i65 %value, 33 ret i65 %result } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll @@ -176,12 +176,12 @@ ; GFX10-LABEL: test_v4f16_v4f32_add_fma_ext_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_pk_mul_f16 v12, v12, v14 -; GFX10-NEXT: v_pk_mul_f16 v13, v13, v15 ; GFX10-NEXT: v_fma_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_pk_mul_f16 v4, v13, v15 ; GFX10-NEXT: v_fma_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] -; GFX10-NEXT: v_fma_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1] -; GFX10-NEXT: v_fma_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; GFX10-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_fma_mix_f32 v2, v2, v6, v4 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v3, v3, v7, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; GFX10-NEXT: v_add_f32_e32 v1, v1, v9 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v10 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v11 @@ -190,12 +190,12 @@ ; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_fma_ext_mul: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v12, v12, v14 -; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v13, v13, v15 ; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v4, v13, v15 ; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] -; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1] -; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v2, v6, v4 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v3, v7, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v9 ; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v10 ; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v11 @@ -204,12 +204,12 @@ ; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_fma_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14 -; GFX10-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15 ; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_pk_mul_f16 v4, v13, v15 ; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] -; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1] -; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v2, v6, v4 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v3, v7, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v9 ; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v10 ; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v11 @@ -245,33 +245,33 @@ ; GFX10-LABEL: test_v4f16_v4f32_add_ext_fma_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_pk_mul_f16 v8, v8, v10 -; GFX10-NEXT: v_pk_mul_f16 v9, v9, v11 -; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v8 -; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v9 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_add_f32_e32 v0, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v5 -; GFX10-NEXT: v_add_f32_e32 v2, v8, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v9, v7 +; GFX10-NEXT: v_pk_fma_f16 v2, v0, v2, v8 +; GFX10-NEXT: v_pk_mul_f16 v0, v9, v11 +; GFX10-NEXT: v_pk_fma_f16 v3, v1, v3, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_ext_fma_mul: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v8, v8, v10 -; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v9, v9, v11 -; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v8 -; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v9 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v2, v4 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v3, v5 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v8, v6 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v9, v7 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v2, v0, v2, v8 +; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v0, v9, v11 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v3, v1, v3, v0 +; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul: @@ -280,16 +280,16 @@ ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v9, v11 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v8 -; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v2 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v2, v4 -; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v3, v5 -; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v8, v6 -; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v9, v7 +; GFX10-DENORM-NEXT: v_pk_add_f16 v3, v0, v8 +; GFX10-DENORM-NEXT: v_pk_add_f16 v8, v1, v2 +; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul <4 x half> %u, %v @@ -318,12 +318,12 @@ ; GFX10-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_pk_mul_f16 v12, v12, v14 -; GFX10-NEXT: v_pk_mul_f16 v13, v13, v15 ; GFX10-NEXT: v_fma_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_pk_mul_f16 v8, v13, v15 ; GFX10-NEXT: v_fma_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] -; GFX10-NEXT: v_fma_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1] -; GFX10-NEXT: v_fma_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_fma_mix_f32 v6, v6, v10, v8 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v7, v7, v11, v8 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 @@ -332,12 +332,12 @@ ; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v12, v12, v14 -; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v13, v13, v15 ; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v8, v13, v15 ; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] -; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1] -; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v6, v6, v10, v8 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v7, v7, v11, v8 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v5 ; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v7 @@ -346,12 +346,12 @@ ; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14 -; GFX10-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15 ; GFX10-DENORM-NEXT: v_fma_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_pk_mul_f16 v8, v13, v15 ; GFX10-DENORM-NEXT: v_fma_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] -; GFX10-DENORM-NEXT: v_fma_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1] -; GFX10-DENORM-NEXT: v_fma_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v6, v6, v10, v8 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v7, v7, v11, v8 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v5 ; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v7 @@ -387,33 +387,33 @@ ; GFX10-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_pk_mul_f16 v8, v8, v10 -; GFX10-NEXT: v_pk_mul_f16 v9, v9, v11 ; GFX10-NEXT: v_pk_fma_f16 v4, v4, v6, v8 -; GFX10-NEXT: v_pk_fma_f16 v5, v5, v7, v9 +; GFX10-NEXT: v_pk_mul_f16 v6, v9, v11 +; GFX10-NEXT: v_pk_fma_f16 v5, v5, v7, v6 ; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v4 ; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v8, v8, v10 -; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v9, v9, v11 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v4, v4, v6, v8 -; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v5, v5, v7, v9 +; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v6, v9, v11 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v5, v5, v7, v6 ; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v6, v4 ; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs: @@ -426,12 +426,12 @@ ; GFX10-DENORM-NEXT: v_pk_add_f16 v5, v5, v6 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v6, v4 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul <4 x half> %u, %v Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll @@ -552,27 +552,28 @@ ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: s_clause 0x8 +; GFX10-CONTRACT-NEXT: s_clause 0x6 +; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(5) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] +; GFX10-CONTRACT-NEXT: s_clause 0x1 ; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; GFX10-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(6) -; GFX10-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(4) -; GFX10-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(2) -; GFX10-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] +; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(5) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(3) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] ; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] ; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] ; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] ; GFX10-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; @@ -580,27 +581,28 @@ ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-DENORM-NEXT: s_clause 0x8 +; GFX10-DENORM-NEXT: s_clause 0x6 +; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(5) +; GFX10-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] +; GFX10-DENORM-NEXT: s_clause 0x1 ; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; GFX10-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; GFX10-DENORM-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(6) -; GFX10-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(4) -; GFX10-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(2) -; GFX10-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] +; GFX10-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(5) +; GFX10-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(3) +; GFX10-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] ; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] ; GFX10-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] ; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] ; GFX10-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: @@ -667,27 +669,28 @@ ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: s_clause 0x8 +; GFX10-CONTRACT-NEXT: s_clause 0x6 +; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(5) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] +; GFX10-CONTRACT-NEXT: s_clause 0x1 ; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; GFX10-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(6) -; GFX10-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(4) -; GFX10-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(2) -; GFX10-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] +; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(5) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(3) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] ; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] ; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] ; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] ; GFX10-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; @@ -695,27 +698,28 @@ ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-DENORM-NEXT: s_clause 0x8 +; GFX10-DENORM-NEXT: s_clause 0x6 +; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(5) +; GFX10-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] +; GFX10-DENORM-NEXT: s_clause 0x1 ; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; GFX10-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; GFX10-DENORM-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(6) -; GFX10-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(4) -; GFX10-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(2) -; GFX10-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] +; GFX10-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(5) +; GFX10-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(3) +; GFX10-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] ; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] ; GFX10-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] ; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] ; GFX10-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll @@ -97,12 +97,12 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_xor_b32_e32 v8, 0x80008000, v2 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v9, 0x80008000, v3 ; GFX10-DENORM-NEXT: v_fma_mix_f32 v5, v0, -v2, -v5 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v2, -v4 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, 0x80008000, v3 ; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v1, -v3, -v7 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v8, -v4 op_sel_hi:[1,1,0] -; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v1, v9, -v6 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v1, v2, -v6 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: @@ -131,12 +131,12 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_xor_b32_e32 v8, 0x80008000, v2 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v9, 0x80008000, v3 ; GFX10-DENORM-NEXT: v_fma_mix_f32 v5, v0, -v2, -v5 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v2, -v4 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, 0x80008000, v3 ; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v1, -v3, -v7 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v8, -v4 op_sel_hi:[1,1,0] -; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v1, v9, -v6 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v1, v2, -v6 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: @@ -166,12 +166,12 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_xor_b32_e32 v8, 0x80008000, v6 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v9, 0x80008000, v7 ; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, -v4, -v6, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, 0x80008000, v6 ; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, -v5, -v7, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, -v4, v8, v0 op_sel_hi:[1,1,0] -; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, -v5, v9, v2 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, -v4, v6, v0 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_xor_b32_e32 v4, 0x80008000, v7 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, -v5, v4, v2 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast <4 x half> %y, %z @@ -199,12 +199,12 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_xor_b32_e32 v8, 0x80008000, v6 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v9, 0x80008000, v7 ; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, -v4, -v6, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, 0x80008000, v6 ; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, -v5, -v7, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, -v4, v8, v0 op_sel_hi:[1,1,0] -; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, -v5, v9, v2 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, -v4, v6, v0 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_xor_b32_e32 v4, 0x80008000, v7 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, -v5, v4, v2 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast <4 x half> %y, %z Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -342,26 +342,29 @@ ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 6, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 7, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v3 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v12, v8, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v13, v9, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v14, v8, v10, s4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v9, v11, s4 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v11, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v3 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v10, v12, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v13, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v15, v5, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v5, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v12, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v17, v13, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v16, v10, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v17, v11, v7, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s4 @@ -370,87 +373,87 @@ ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v3 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v14, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v15, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v14, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_load_b128 v[16:19], v[0:1], off -; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 -; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32 -; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v19, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v18 :: v_dual_add_nc_u32 v1, 1, v0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v18, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v19, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_cndmask_b32 v3, v3, v7 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off +; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v8 :: v_dual_cndmask_b32 v3, v3, v9 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v11, v3, v5 :: v_dual_cndmask_b32 v12, v4, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 7, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v16 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v5, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v7, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v12 :: v_dual_cndmask_b32 v3, v3, v13 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v6, v6, v10 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v14, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v15, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v14, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32 +; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v16 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v8 :: v_dual_cndmask_b32 v1, v6, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v8, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v9, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v10, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v15, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v11, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s1 +; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v12 :: v_dual_cndmask_b32 v4, v4, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v14, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v15, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, v14, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v4, v15, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, ptr addrspace(1) %ptr %element = extractelement <4 x i128> %vector, i32 %idx @@ -670,56 +673,57 @@ ; ; GFX10-LABEL: extractelement_sgpr_v4i128_vgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 +; GFX10-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 2, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v2 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 4, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 6, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 7, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s10 +; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s12, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s14, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s16, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s18, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s20, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s22, s5 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s11 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s9, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s15, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s17, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s19, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s21, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s23, s5 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 3, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 5, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 6, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 2, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, s9, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 7, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s13, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s15, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s17, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s18, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s19, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s20, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s21, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s22, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s23, s7 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -738,16 +738,16 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -1566,16 +1566,16 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 2, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -699,21 +699,21 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v3, v0 +; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16: @@ -924,21 +924,21 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v3, v0 +; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_ulp25: @@ -1619,21 +1619,21 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v3, v0 +; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -88,15 +88,15 @@ ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -247,15 +247,15 @@ ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -385,15 +385,15 @@ ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v3, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2 -; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v3, v4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v3 +; GFX10-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -539,15 +539,15 @@ ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v3, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2 -; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v3, v4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v3 +; GFX10-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -786,15 +786,15 @@ ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -973,27 +973,26 @@ ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX10-IEEE-NEXT: v_div_scale_f32 v6, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v8, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX10-IEEE-NEXT: v_div_scale_f32 v7, vcc_lo, v1, v3, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v7, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v8, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v8, v9, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v8, v7 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v8 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1012,20 +1011,20 @@ ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v7, v8, v5 ; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v6, s4, v3, v3, v1 ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v6 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v3, v3, v1 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 -; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v6, v5, 1.0 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v4, v5 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v5 -; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v6, v4, v2 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v7, v5 -; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v6, v4, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v6, v7, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v5, v4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1224,27 +1223,26 @@ ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX10-IEEE-NEXT: v_div_scale_f32 v6, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v8, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX10-IEEE-NEXT: v_div_scale_f32 v7, vcc_lo, v1, v3, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v7, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v8, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v8, v9, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v8, v7 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v8 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1256,14 +1254,14 @@ ; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4 ; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v3| ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s4 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-IEEE-LABEL: v_fdiv_v2f32_ulp25: @@ -1448,27 +1446,26 @@ ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v1, 1.0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1487,20 +1484,20 @@ ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0 ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v1, 1.0 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v3 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 -; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v3, v2, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v5, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v3, -v3, v5, v4 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1709,27 +1706,26 @@ ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v1, 1.0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1748,20 +1744,20 @@ ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0 ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v1, 1.0 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v3 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 -; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v3, v2, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v5, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v3, -v3, v5, v4 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -2070,27 +2066,26 @@ ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX10-IEEE-NEXT: v_div_scale_f32 v6, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v8, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX10-IEEE-NEXT: v_div_scale_f32 v7, vcc_lo, v1, v3, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v7, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v8, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v8, v9, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v8, v7 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v8 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -2102,14 +2097,14 @@ ; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4 ; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v3| ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s4 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll @@ -71,15 +71,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX10-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -214,15 +214,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX10-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -309,15 +309,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX10-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[6:7], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -404,15 +404,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX10-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[6:7], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -548,15 +548,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX10-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[6:7], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -691,15 +691,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX10-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -824,27 +824,26 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] ; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_div_scale_f64 v[14:15], vcc_lo, v[2:3], v[6:7], v[2:3] +; GFX10-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -854,8 +853,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1] ; GFX11-NEXT: v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3] -; GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -869,20 +867,21 @@ ; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, v[0:1], v[4:5], v[0:1] ; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX11-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] +; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX11-NEXT: v_mul_f64 v[20:21], v[16:17], v[14:15] ; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> %a, %b @@ -916,21 +915,21 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] -; GFX10-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9] -; GFX10-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11] -; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1] -; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3] -; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] -; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[8:9] +; GFX10-NEXT: v_mul_f64 v[10:11], v[0:1], v[8:9] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[10:11], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[6:7] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[8:9], v[2:3] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f64_afn: @@ -1061,27 +1060,26 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] ; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_div_scale_f64 v[14:15], vcc_lo, v[2:3], v[6:7], v[2:3] +; GFX10-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1091,8 +1089,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1] ; GFX11-NEXT: v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3] -; GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -1106,20 +1103,21 @@ ; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, v[0:1], v[4:5], v[0:1] ; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX11-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] +; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX11-NEXT: v_mul_f64 v[20:21], v[16:17], v[14:15] ; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> %a, %b, !fpmath !0 @@ -1223,27 +1221,26 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, 1.0, v[2:3], 1.0 +; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1385,27 +1382,26 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, 1.0, v[2:3], 1.0 +; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1477,21 +1473,21 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[0:1] -; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[2:3] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] -; GFX10-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] -; GFX10-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] -; GFX10-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5] -; GFX10-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7] -; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] -; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[0:1], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[0:1], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], 1.0, v[4:5] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7] +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], 1.0, v[4:5] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rcp_v2f64_arcp_afn: @@ -1623,27 +1619,26 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, 1.0, v[2:3], 1.0 +; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1715,21 +1710,21 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] -; GFX10-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9] -; GFX10-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11] -; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1] -; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3] -; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] -; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[8:9] +; GFX10-NEXT: v_mul_f64 v[10:11], v[0:1], v[8:9] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[10:11], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[6:7] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[8:9], v[2:3] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f64_afn_ulp25: @@ -1860,27 +1855,26 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] ; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_div_scale_f64 v[14:15], vcc_lo, v[2:3], v[6:7], v[2:3] +; GFX10-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,8 +1884,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1] ; GFX11-NEXT: v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3] -; GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -1905,20 +1898,21 @@ ; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, v[0:1], v[4:5], v[0:1] ; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX11-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] +; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX11-NEXT: v_mul_f64 v[20:21], v[16:17], v[14:15] ; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x double> %a, %b, !fpmath !0 @@ -1952,21 +1946,21 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] -; GFX10-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9] -; GFX10-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11] -; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1] -; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3] -; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] -; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[8:9] +; GFX10-NEXT: v_mul_f64 v[10:11], v[0:1], v[8:9] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[10:11], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[6:7] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[8:9], v[2:3] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f64_arcp_afn_ulp25: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -345,15 +345,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v1, v2 -; GFX10-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX10-NEXT: v_min_f32_e32 v2, v4, v3 +; GFX10-NEXT: v_min_f32_e32 v3, v1, v2 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX10-NEXT: v_max_f32_e32 v1, v3, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1033,18 +1033,18 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v1, v2 +; GFX10-NEXT: v_min_f32_e32 v3, v1, v2 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX10-NEXT: v_min_f32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v1, v2 -; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX10-NEXT: global_store_dword v[0:1], v3, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -899,28 +899,28 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 -; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b16 v3, v3, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4 +; GFX10-NEXT: v_lshlrev_b16 v0, v5, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5 -; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v4 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v5, 1, v5 +; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: v_lshrrev_b16 v4, v4, v5 +; GFX10-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v2i8: @@ -1388,54 +1388,54 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4 +; GFX10-NEXT: v_lshrrev_b16 v3, v3, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX10-NEXT: v_lshlrev_b16 v5, v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v8 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v12, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v11 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_lshrrev_b16 v4, v4, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v6 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX10-NEXT: v_lshlrev_b16 v7, v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v8, 0xff +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX10-NEXT: v_lshrrev_b16 v8, 1, v8 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX10-NEXT: v_lshrrev_b16 v6, v6, v8 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_lshlrev_b16 v8, v8, v0 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v2 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 -; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 -; GFX10-NEXT: v_lshrrev_b16 v6, v10, v6 -; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1 -; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 -; GFX10-NEXT: v_lshrrev_b16 v5, v13, v7 -; GFX10-NEXT: v_lshrrev_b16 v7, v9, v12 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, 8 -; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0 +; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v8, v3 +; GFX10-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v4i8: @@ -2321,13 +2321,13 @@ ; GFX10-NEXT: v_and_or_b32 v2, v0, 0xff, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 -; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4 ; GFX10-NEXT: v_or3_b32 v0, v2, v0, v3 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_bfe_u32 v2, v1, 8, 8 +; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v2 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v2i24: @@ -2637,51 +2637,51 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX10-NEXT: v_bfe_u32 v3, v3, 1, 23 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX10-NEXT: v_mul_lo_u32 v8, 0xffffffe8, v6 -; GFX10-NEXT: v_mul_lo_u32 v9, 0xffffffe8, v7 -; GFX10-NEXT: v_mul_hi_u32 v8, v6, v8 -; GFX10-NEXT: v_mul_hi_u32 v9, v7, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v9 +; GFX10-NEXT: v_mul_lo_u32 v7, 0xffffffe8, v6 +; GFX10-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v7 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 ; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v4, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, v5, v3 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_lo_u32 v4, 0xffffffe8, v2 +; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v5 +; GFX10-NEXT: v_mul_hi_u32 v2, v4, v2 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, 24 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v4, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 24, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 24, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, v2, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v2i24: @@ -3207,11 +3207,11 @@ ; GFX10-NEXT: v_alignbit_b32 v2, v0, v2, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX10-NEXT: v_alignbit_b32 v3, v1, v3, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX10-NEXT: v_alignbit_b32 v2, v1, v3, 1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v2, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v2i32: @@ -3291,15 +3291,15 @@ ; GFX10-NEXT: v_alignbit_b32 v3, v0, v3, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX10-NEXT: v_alignbit_b32 v4, v1, v4, 1 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6 +; GFX10-NEXT: v_alignbit_b32 v3, v1, v4, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 -; GFX10-NEXT: v_alignbit_b32 v5, v2, v5, 1 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v4 +; GFX10-NEXT: v_alignbit_b32 v3, v2, v5, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7 -; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v8 +; GFX10-NEXT: v_alignbit_b32 v2, v2, v3, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v3i32: @@ -3395,19 +3395,19 @@ ; GFX10-NEXT: v_alignbit_b32 v4, v0, v4, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX10-NEXT: v_alignbit_b32 v5, v1, v5, 1 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8 +; GFX10-NEXT: v_alignbit_b32 v4, v1, v5, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v9 -; GFX10-NEXT: v_alignbit_b32 v6, v2, v6, 1 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v9 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v5 +; GFX10-NEXT: v_alignbit_b32 v4, v2, v6, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v10 -; GFX10-NEXT: v_alignbit_b32 v7, v3, v7, 1 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v10 +; GFX10-NEXT: v_alignbit_b32 v2, v2, v4, v5 +; GFX10-NEXT: v_alignbit_b32 v4, v3, v7, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9 -; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10 -; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX10-NEXT: v_alignbit_b32 v3, v3, v4, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v4i32: @@ -4889,19 +4889,19 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5 -; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0 -; GFX10-NEXT: v_pk_lshlrev_b16 v1, v5, v1 -; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 -; GFX10-NEXT: v_pk_lshrrev_b16 v3, v7, v3 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v5 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v4, v2 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v6, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_pk_lshrrev_b16 v3, v4, v3 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5266,19 +5266,19 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5 -; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0 -; GFX10-NEXT: v_pk_lshlrev_b16 v1, v5, v1 -; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 -; GFX10-NEXT: v_pk_lshrrev_b16 v3, v7, v3 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v5 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v4, v2 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v6, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_pk_lshrrev_b16 v3, v4, v3 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5442,12 +5442,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5956,22 +5956,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 +; GFX10-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 -; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 -; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v10 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[6:7] +; GFX10-NEXT: v_and_b32_e32 v6, 63, v8 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v6, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v2i64: @@ -6392,106 +6392,102 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8 +; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v8 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 31, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v13 +; GFX10-NEXT: v_lshlrev_b64 v[11:12], v13, v[2:3] +; GFX10-NEXT: v_and_b32_e32 v15, 0x7f, v8 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v13 +; GFX10-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] +; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v15 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v15 +; GFX10-NEXT: v_or_b32_e32 v14, v9, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 31, v6 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v18 -; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v5, v5, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v9, v[0:1] -; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[6:7] -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 -; GFX10-NEXT: v_or_b32_e32 v14, v14, v16 -; GFX10-NEXT: v_or_b32_e32 v15, v15, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v14, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s4 -; GFX10-NEXT: v_or_b32_e32 v0, v12, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v7, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX10-NEXT: v_or_b32_e32 v12, v10, v12 +; GFX10-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v15, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v10, v10, v8 +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v13 +; GFX10-NEXT: v_or_b32_e32 v11, v11, v9 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v13, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v2, s4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v3, s4 +; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v15 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v2, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v3, v5, s5 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v15, v[6:7] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s4 +; GFX10-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX10-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX11-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX11-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX11-NEXT: v_xor_b32_e32 v12, -1, v8 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v18 -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] -; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 31, v6 -; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v9, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v14 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v14, v[2:3] +; GFX11-NEXT: v_and_b32_e32 v16, 0x7f, v12 +; GFX11-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v14 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 31, v6 +; GFX11-NEXT: v_subrev_nc_u32_e32 v17, 64, v14 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v5, v5, v12 -; GFX11-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] -; GFX11-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX11-NEXT: v_or_b32_e32 v11, v9, v11 ; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo -; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 -; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[6:7] -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v14, v14, v16 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v15, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s0 -; GFX11-NEXT: v_or_b32_e32 v0, v12, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, v7, v5 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v16 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v15 +; GFX11-NEXT: v_or_b32_e32 v15, v10, v8 +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v16 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v17, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v18, v11, v9 +; GFX11-NEXT: v_subrev_nc_u32_e32 v17, 64, v16 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v16, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v15, v0, v15 :: v_dual_cndmask_b32 v18, v1, v18 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v17, v[6:7] +; GFX11-NEXT: v_lshrrev_b64 v[6:7], v16, v[6:7] +; GFX11-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v8, 0, v13, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v15, v2, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v18, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v5, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, v7, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) ret i128 %result @@ -6656,50 +6652,50 @@ ; ; GFX10-LABEL: v_fshl_i128_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0x7f, v0 ; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX10-NEXT: s_lshl_b32 s9, s6, 31 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] -; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v5 +; GFX10-NEXT: v_and_b32_e32 v7, 0x7f, v0 +; GFX10-NEXT: v_lshlrev_b64 v[3:4], v5, s[2:3] ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] -; GFX10-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] -; GFX10-NEXT: v_or_b32_e32 v3, v3, v1 -; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 -; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX10-NEXT: v_or_b32_e32 v7, v7, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, s2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, s3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 -; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 64, v7 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v5 +; GFX10-NEXT: v_or_b32_e32 v6, v1, v3 +; GFX10-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v7, s[4:5] +; GFX10-NEXT: v_or_b32_e32 v8, v2, v0 +; GFX10-NEXT: v_or_b32_e32 v9, v3, v1 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, s[0:1] +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v5 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v7 +; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v7 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v0, s[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v7, s[6:7] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, s5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, s2, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, s1 +; GFX10-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshl_i128_ssv: @@ -6953,35 +6949,35 @@ ; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s0, 64, s4 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] +; GFX10-NEXT: s_sub_i32 s2, 64, s4 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX10-NEXT: s_sub_i32 s0, s4, 64 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s2, v[2:3] +; GFX10-NEXT: s_sub_i32 s2, s4, 64 ; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_cselect_b32 s3, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX10-NEXT: v_or_b32_e32 v6, v4, v6 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: s_and_b32 s0, 1, s1 -; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s0, 1, s5 -; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 -; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s9, v1 -; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX10-NEXT: s_and_b32 s6, 1, s3 +; GFX10-NEXT: v_or_b32_e32 v7, v5, v7 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s2, v[2:3] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX10-NEXT: s_and_b32 s2, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10-NEXT: s_and_b32 s2, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[2:3] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v0, s8, v4 +; GFX10-NEXT: v_or_b32_e32 v1, s9, v5 +; GFX10-NEXT: v_or_b32_e32 v2, s0, v2 +; GFX10-NEXT: v_or_b32_e32 v3, s1, v3 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshl_i128_svs: @@ -7214,53 +7210,53 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX10-NEXT: s_sub_i32 s5, s8, 64 -; GFX10-NEXT: s_sub_i32 s6, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] +; GFX10-NEXT: s_andn2_b64 s[10:11], s[6:7], s[4:5] +; GFX10-NEXT: s_sub_i32 s9, 64, s8 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] -; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s9, v[0:1] +; GFX10-NEXT: s_sub_i32 s4, s8, 64 +; GFX10-NEXT: s_cmp_lt_u32 s8, 64 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_and_b32 s6, 1, s9 +; GFX10-NEXT: v_or_b32_e32 v6, v4, v6 +; GFX10-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-NEXT: s_and_b32 s5, 1, s5 +; GFX10-NEXT: v_or_b32_e32 v7, v5, v7 +; GFX10-NEXT: v_lshlrev_b64 v[4:5], s8, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX10-NEXT: s_and_b32 s4, 1, s6 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_lshl_b32 s9, s2, 31 -; GFX10-NEXT: s_mov_b32 s8, s7 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX10-NEXT: s_and_b32 s5, 1, s10 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 +; GFX10-NEXT: s_lshl_b32 s5, s2, 31 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_sub_i32 s10, s4, 64 -; GFX10-NEXT: s_sub_i32 s8, 64, s4 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 -; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo +; GFX10-NEXT: s_sub_i32 s11, s10, 64 +; GFX10-NEXT: s_sub_i32 s6, 64, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, v3, s4 +; GFX10-NEXT: s_mov_b32 s4, s7 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc_lo ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo -; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cselect_b32 s13, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s10 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[2:3], s10 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s11 ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 -; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 +; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v4 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 +; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX10-NEXT: ; return to shader part epilog @@ -7422,10 +7418,10 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7] -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 31, v7 -; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 31, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX10-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_i128_65: @@ -8172,185 +8168,203 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 +; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v16 ; GFX10-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 31, v10 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v27 -; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v16 -; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v9, v9, v21 -; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 -; GFX10-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 -; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 -; GFX10-NEXT: v_or_b32_e32 v18, v16, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 -; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] -; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v28 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 -; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] +; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v20 +; GFX10-NEXT: v_lshrrev_b64 v[12:13], 1, v[12:13] +; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v23 +; GFX10-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX10-NEXT: v_and_b32_e32 v24, 0x7f, v16 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v23 +; GFX10-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27 -; GFX10-NEXT: v_or_b32_e32 v23, v23, v25 -; GFX10-NEXT: v_or_b32_e32 v24, v24, v26 -; GFX10-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v24, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v22, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v22, v19, v3, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v0, s4 -; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20 -; GFX10-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v20 -; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v1, s4 -; GFX10-NEXT: v_or_b32_e32 v1, v11, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v23 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v9 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v24 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v24 +; GFX10-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 31, v10 +; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] +; GFX10-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX10-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] +; GFX10-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] +; GFX10-NEXT: v_or_b32_e32 v25, v18, v16 +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v27 +; GFX10-NEXT: v_or_b32_e32 v26, v19, v17 +; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v28, v16, v18 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 31, v14 -; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v3 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[12:13], v23, v[6:7] ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] -; GFX10-NEXT: v_or_b32_e32 v9, v9, v16 -; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v25 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v23 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v23, v[4:5] -; GFX10-NEXT: v_or_b32_e32 v12, v10, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v25 -; GFX10-NEXT: v_lshrrev_b64 v[18:19], v25, v[8:9] -; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 -; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] -; GFX10-NEXT: v_or_b32_e32 v5, v11, v13 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v25 -; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v16, v18, v20 -; GFX10-NEXT: v_or_b32_e32 v18, v19, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v3, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[3:4], v25, v[14:15] -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v25 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v23 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v18, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v6, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v5, v7, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v10, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v9, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v4, s4 -; GFX10-NEXT: v_or_b32_e32 v3, v22, v24 -; GFX10-NEXT: v_or_b32_e32 v4, v13, v5 -; GFX10-NEXT: v_or_b32_e32 v5, v14, v8 -; GFX10-NEXT: v_or_b32_e32 v6, v6, v9 -; GFX10-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX10-NEXT: v_or_b32_e32 v29, v17, v19 +; GFX10-NEXT: v_or_b32_e32 v13, v13, v16 +; GFX10-NEXT: v_xor_b32_e32 v16, -1, v20 +; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v16 +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v20 +; GFX10-NEXT: v_lshrrev_b64 v[18:19], v20, v[12:13] +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 0, v20 +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[14:15] +; GFX10-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v23 +; GFX10-NEXT: v_or_b32_e32 v19, v19, v17 +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v21, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v22, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v2, s4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v24 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v3, s4 +; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v24 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v25, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v26, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v2, v8, s5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v27 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v3, v9, s5 +; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v27 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v27, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v28, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v29, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v2, v6, s6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v3, v7, s6 +; GFX10-NEXT: v_cmp_gt_u32_e64 s6, 64, v20 +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v20, v[14:15] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[14:15] +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v6, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v7, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v18, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v19, s6 +; GFX10-NEXT: v_or_b32_e32 v6, v21, v6 +; GFX10-NEXT: v_or_b32_e32 v7, v22, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v2, v12, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v3, v13, s7 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v24, v[10:11] +; GFX10-NEXT: v_or_b32_e32 v4, v4, v12 +; GFX10-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s4 +; GFX10-NEXT: v_or_b32_e32 v2, v16, v2 +; GFX10-NEXT: v_or_b32_e32 v3, v17, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v2i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 31, v10 +; GFX11-NEXT: v_and_b32_e32 v23, 0x7f, v16 ; GFX11-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] +; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v20 +; GFX11-NEXT: v_lshrrev_b64 v[12:13], 1, v[12:13] +; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v23 +; GFX11-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 31, v10 +; GFX11-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v27 -; GFX11-NEXT: v_or_b32_e32 v9, v9, v21 -; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 -; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v16 -; GFX11-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] -; GFX11-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo -; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 -; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v28 -; GFX11-NEXT: v_or_b32_e32 v18, v16, v18 -; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 -; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] -; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28 -; GFX11-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] -; GFX11-NEXT: v_or_b32_e32 v23, v23, v25 -; GFX11-NEXT: v_or_b32_e32 v24, v24, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v0, v18 :: v_dual_cndmask_b32 v19, v1, v19 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v23, s0 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v10, v17, v24, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v11, 0, v22, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v22, v19, v3, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v16, v8, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v10, v9, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v0, s0 -; GFX11-NEXT: v_and_b32_e32 v23, 0x7f, v20 -; GFX11-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v20 -; GFX11-NEXT: v_cndmask_b32_e64 v24, 0, v1, s0 -; GFX11-NEXT: v_or_b32_e32 v1, v11, v8 -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v23 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v9 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX11-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v24 +; GFX11-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] +; GFX11-NEXT: v_or_b32_e32 v25, v18, v16 +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v26, v19, v17 +; GFX11-NEXT: v_lshlrev_b64 v[18:19], v27, v[6:7] +; GFX11-NEXT: v_lshrrev_b64 v[16:17], v16, v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v28, v16, v18 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 31, v14 -; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v3 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[12:13], v23, v[6:7] ; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] -; GFX11-NEXT: v_or_b32_e32 v9, v9, v16 -; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v25 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v23 -; GFX11-NEXT: v_lshlrev_b64 v[16:17], v23, v[4:5] -; GFX11-NEXT: v_or_b32_e32 v12, v10, v12 -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v25 -; GFX11-NEXT: v_lshrrev_b64 v[18:19], v25, v[8:9] -; GFX11-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 -; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] -; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v25 -; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v16, v18, v20 -; GFX11-NEXT: v_or_b32_e32 v18, v19, v21 -; GFX11-NEXT: v_dual_cndmask_b32 v12, v3, v12 :: v_dual_cndmask_b32 v5, v4, v5 -; GFX11-NEXT: v_lshrrev_b64 v[3:4], v25, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v16, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v25 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v23 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v18, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v10, v8, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v9, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v4, s0 -; GFX11-NEXT: v_or_b32_e32 v3, v22, v24 -; GFX11-NEXT: v_or_b32_e32 v4, v13, v5 -; GFX11-NEXT: v_or_b32_e32 v5, v14, v8 -; GFX11-NEXT: v_or_b32_e32 v6, v6, v9 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-NEXT: v_or_b32_e32 v29, v17, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v13, v13, v16 +; GFX11-NEXT: v_xor_b32_e32 v16, -1, v20 +; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v20 +; GFX11-NEXT: v_lshrrev_b64 v[18:19], v20, v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[14:15] +; GFX11-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v19, v19, v17 +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v21, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v22, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v24 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v2, s0 +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 64, v24 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v3, s0 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v24 +; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v0 :: v_dual_cndmask_b32 v1, 0, v1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, v[10:11] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v25, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v26, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v8, v2, v8, s1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 64, v27 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v3, v9, s1 +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX11-NEXT: v_cndmask_b32_e64 v22, v2, v28, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v23, v3, v29, s1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v24, v[10:11] +; GFX11-NEXT: v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v7, v23, v7 +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v27, v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v3, s0 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v20 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v20 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v20, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v21, 0, v4, s1 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], v3, v[14:15] +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v20 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, v5, s1 +; GFX11-NEXT: v_or_b32_e32 v2, v16, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v18, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v19, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v11, v3, v12, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v4, v13, s2 +; GFX11-NEXT: v_or_b32_e32 v3, v17, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v4, v21, v11 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v12 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) ret <2 x i128> %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -895,28 +895,28 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: v_lshlrev_b16 v4, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 -; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_e32 v6, 7, v5 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_lshlrev_b16 v3, v3, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v5, v0 +; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_v2i8: @@ -1387,54 +1387,54 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 -; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b16 v5, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX10-NEXT: v_lshrrev_b16 v3, v3, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v6 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX10-NEXT: v_lshlrev_b16 v9, 1, v9 +; GFX10-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_lshlrev_b16 v4, v4, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX10-NEXT: v_lshrrev_b16 v5, v7, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_lshlrev_b16 v6, v6, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v7 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 -; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 -; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX10-NEXT: v_lshlrev_b16 v8, v8, v9 +; GFX10-NEXT: v_mov_b32_e32 v9, 0xff +; GFX10-NEXT: v_and_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX10-NEXT: v_lshrrev_b16 v7, v7, v9 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v5, v5, v7 -; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v13, v6 -; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 -; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, 8 -; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_or_b32_e32 v3, v6, v5 +; GFX10-NEXT: v_or_b32_e32 v4, v8, v7 +; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_v4i8: @@ -2328,15 +2328,15 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 -; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX10-NEXT: v_and_or_b32 v2, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4 ; GFX10-NEXT: v_or3_b32 v0, v2, v0, v3 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_bfe_u32 v2, v1, 8, 8 +; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v2 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_v2i24: @@ -2654,53 +2654,53 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX10-NEXT: v_mul_lo_u32 v8, 0xffffffe8, v6 -; GFX10-NEXT: v_mul_lo_u32 v9, 0xffffffe8, v7 -; GFX10-NEXT: v_mul_hi_u32 v8, v6, v8 -; GFX10-NEXT: v_mul_hi_u32 v9, v7, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v9 +; GFX10-NEXT: v_mul_lo_u32 v7, 0xffffffe8, v6 +; GFX10-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v7 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 ; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, v6, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, v4, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, v4, v2 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_lo_u32 v4, 0xffffffe8, v2 +; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v5 +; GFX10-NEXT: v_mul_hi_u32 v2, v4, v2 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, 24 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v4, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 24, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 24, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, v2, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_v2i24: @@ -4908,19 +4908,19 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5 -; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7 -; GFX10-NEXT: v_pk_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_pk_lshrrev_b16 v3, v5, v3 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0 -; GFX10-NEXT: v_pk_lshlrev_b16 v1, v7, v1 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v5 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX10-NEXT: v_pk_lshrrev_b16 v3, v6, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v4, v1 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5385,19 +5385,19 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5 -; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7 -; GFX10-NEXT: v_pk_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_pk_lshrrev_b16 v3, v5, v3 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0 -; GFX10-NEXT: v_pk_lshlrev_b16 v1, v7, v1 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v5 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX10-NEXT: v_pk_lshrrev_b16 v3, v6, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v4, v1 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5591,12 +5591,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] ; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6102,22 +6102,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 +; GFX10-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] ; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 -; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 -; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v10 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX10-NEXT: v_and_b32_e32 v4, 63, v10 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v4, v[6:7] +; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_v2i64: @@ -6537,49 +6537,49 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 +; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v8 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v1 -; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v13 +; GFX10-NEXT: v_lshrrev_b64 v[11:12], v13, v[4:5] +; GFX10-NEXT: v_and_b32_e32 v15, 0x7f, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v13 +; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, v[6:7] +; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v15 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v15 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v15 +; GFX10-NEXT: v_or_b32_e32 v14, v11, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v9 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v19 -; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] -; GFX10-NEXT: v_lshrrev_b64 v[12:13], v19, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[14:15], v18, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19 -; GFX10-NEXT: v_or_b32_e32 v12, v12, v16 -; GFX10-NEXT: v_or_b32_e32 v10, v10, v8 -; GFX10-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v21, v[6:7] -; GFX10-NEXT: v_or_b32_e32 v13, v13, v17 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v12, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s4 -; GFX10-NEXT: v_or_b32_e32 v0, v14, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v7, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX10-NEXT: v_or_b32_e32 v12, v12, v10 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3] +; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v15 +; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v15, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v2, s4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v3, s4 +; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v13 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v14, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v2, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v3, v5, s5 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, v[6:7] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s4 +; GFX10-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX10-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_i128: @@ -6587,52 +6587,57 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_xor_b32_e32 v9, -1, v8 +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v14, 0x7f, v9 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b64 v[14:15], v18, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc_lo -; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v14 +; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v14 +; GFX11-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v14 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v14 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] -; GFX11-NEXT: v_subrev_nc_u32_e32 v21, 64, v19 -; GFX11-NEXT: v_lshrrev_b64 v[12:13], v19, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v8 -; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v21, v[6:7] -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 -; GFX11-NEXT: v_or_b32_e32 v12, v12, v16 -; GFX11-NEXT: v_or_b32_e32 v13, v13, v17 -; GFX11-NEXT: v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v15, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v12, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v13, s0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v15, 0x7f, v8 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v14, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s0 -; GFX11-NEXT: v_or_b32_e32 v0, v14, v4 -; GFX11-NEXT: v_or_b32_e32 v1, v7, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v15 +; GFX11-NEXT: v_or_b32_e32 v19, v11, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v16, v10, v8 +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v18, v1, v19, vcc_lo +; GFX11-NEXT: v_subrev_nc_u32_e32 v17, 64, v15 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v15, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v16, v0, v16, vcc_lo +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v15 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v15 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v17, v[6:7] +; GFX11-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] +; GFX11-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v16, v2, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v18, v3, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v8, 0, v13, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v5, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, v7, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) ret i128 %result @@ -6797,50 +6802,50 @@ ; ; GFX10-LABEL: v_fshr_i128_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0x7f, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX10-NEXT: s_mov_b32 s9, 0 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX10-NEXT: s_lshr_b32 s8, s1, 31 -; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v5 +; GFX10-NEXT: v_and_b32_e32 v7, 0x7f, v0 +; GFX10-NEXT: v_lshrrev_b64 v[3:4], v5, s[4:5] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] -; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v13, s[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[6:7], v12, s[0:1] -; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 -; GFX10-NEXT: v_or_b32_e32 v4, v4, v8 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v1 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX10-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, s8, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, s9, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 -; GFX10-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX10-NEXT: v_lshlrev_b64 v[1:2], v1, s[6:7] +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 64, v7 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v7 +; GFX10-NEXT: v_or_b32_e32 v6, v3, v1 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v2 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, s[2:3] +; GFX10-NEXT: v_or_b32_e32 v8, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v9, v1, v3 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, s[0:1] +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v7 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v5 +; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, v3, v9, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v0, s[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v5, s[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, s5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, s2, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v9, s3, s1 +; GFX10-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshr_i128_ssv: @@ -7094,34 +7099,34 @@ ; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s0, 64, s8 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX10-NEXT: s_sub_i32 s0, s8, 64 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] +; GFX10-NEXT: s_sub_i32 s2, 64, s8 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s2, v[2:3] +; GFX10-NEXT: s_sub_i32 s2, s8, 64 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_cselect_b32 s3, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX10-NEXT: v_or_b32_e32 v6, v4, v6 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: s_and_b32 s0, 1, s1 -; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s0, 1, s6 -; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], s8, v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 -; GFX10-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s5, v1 -; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX10-NEXT: s_and_b32 s7, 1, s3 +; GFX10-NEXT: v_or_b32_e32 v7, v5, v7 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s2, v[2:3] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s7 +; GFX10-NEXT: s_and_b32 s2, 1, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10-NEXT: s_and_b32 s2, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s8, v[2:3] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v0, s4, v4 +; GFX10-NEXT: v_or_b32_e32 v1, s5, v5 +; GFX10-NEXT: v_or_b32_e32 v2, s0, v2 +; GFX10-NEXT: v_or_b32_e32 v3, s1, v3 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshr_i128_svs: @@ -7355,50 +7360,50 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] -; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX10-NEXT: s_andn2_b64 s[8:9], s[6:7], s[4:5] +; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] ; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX10-NEXT: s_sub_i32 s6, 64, s4 -; GFX10-NEXT: s_sub_i32 s5, s4, 64 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] +; GFX10-NEXT: s_sub_i32 s9, 64, s8 +; GFX10-NEXT: s_sub_i32 s4, s8, 64 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s9, v[0:1] +; GFX10-NEXT: s_cmp_lt_u32 s8, 64 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: s_cselect_b32 s7, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1] -; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_and_b32 s5, 1, s5 +; GFX10-NEXT: v_or_b32_e32 v6, v4, v6 +; GFX10-NEXT: v_or_b32_e32 v7, v5, v7 +; GFX10-NEXT: v_lshlrev_b64 v[4:5], s8, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX10-NEXT: s_and_b32 s4, 1, s7 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 -; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: s_and_b32 s4, 1, s9 -; GFX10-NEXT: s_sub_i32 s10, s8, 64 -; GFX10-NEXT: s_sub_i32 s6, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo +; GFX10-NEXT: s_sub_i32 s10, s6, 64 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 +; GFX10-NEXT: s_sub_i32 s7, 64, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, v3, s4 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 -; GFX10-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s7 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 -; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v4 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX10-NEXT: ; return to shader part epilog @@ -7563,9 +7568,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[6:7] ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 31, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 31, v8 -; GFX10-NEXT: v_or_b32_e32 v1, v9, v5 +; GFX10-NEXT: v_or_b32_e32 v1, v6, v5 ; GFX10-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -8315,92 +8320,92 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v17, -1, v16 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v17 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v26 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v25 +; GFX10-NEXT: v_and_b32_e32 v24, 0x7f, v16 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX10-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX10-NEXT: v_subrev_nc_u32_e32 v19, 64, v25 -; GFX10-NEXT: v_lshlrev_b64 v[23:24], v25, v[0:1] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 -; GFX10-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] -; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v24, 0, v24, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v26 +; GFX10-NEXT: v_xor_b32_e32 v17, -1, v16 +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v24 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v24 +; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v17 +; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v23 +; GFX10-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v23 +; GFX10-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX10-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9] -; GFX10-NEXT: v_cndmask_b32_e32 v22, v1, v22, vcc_lo -; GFX10-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25 -; GFX10-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX10-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v18, v21, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v22, v22, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v16, s4 -; GFX10-NEXT: v_xor_b32_e32 v16, -1, v20 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v17, s4 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v16 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 31, v5 +; GFX10-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] +; GFX10-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] +; GFX10-NEXT: v_or_b32_e32 v25, v18, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v16, 31, v5 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v0, v23, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v25 -; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20 +; GFX10-NEXT: v_or_b32_e32 v26, v19, v17 +; GFX10-NEXT: v_or_b32_e32 v6, v6, v16 +; GFX10-NEXT: v_xor_b32_e32 v16, -1, v20 +; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v20 +; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 0, v20 +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v27 +; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[6:7] +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27 +; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v28, v16, v18 +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v20 +; GFX10-NEXT: v_or_b32_e32 v29, v17, v19 +; GFX10-NEXT: v_lshrrev_b64 v[18:19], v20, v[12:13] +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[14:15] +; GFX10-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v23 +; GFX10-NEXT: v_or_b32_e32 v19, v19, v17 +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v21, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v22, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v2, s4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v24 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v3, s4 +; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v24 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v25, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v26, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v2, v8, s5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v27 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v3, v9, s5 +; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v27 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v27, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v28, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v29, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v2, v6, s6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v3, v7, s6 +; GFX10-NEXT: v_cmp_gt_u32_e64 s6, 64, v20 +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v20, v[14:15] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[14:15] +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v6, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v7, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v18, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v19, s6 +; GFX10-NEXT: v_or_b32_e32 v6, v21, v6 +; GFX10-NEXT: v_or_b32_e32 v7, v22, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v2, v12, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v3, v13, s7 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v24, v[10:11] +; GFX10-NEXT: v_or_b32_e32 v4, v4, v12 +; GFX10-NEXT: v_or_b32_e32 v5, v5, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, v3, s4 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v9, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[6:7] -; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v23 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v25 -; GFX10-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v25, v[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[18:19], v23, v[12:13] -; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v23 -; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 -; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] -; GFX10-NEXT: v_or_b32_e32 v5, v9, v11 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[14:15] -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v23 -; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v16, v18, v20 -; GFX10-NEXT: v_or_b32_e32 v18, v19, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v3, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[3:4], v23, v[14:15] -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v16, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v23 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v25 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v18, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v6, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v5, v7, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v12, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v9, v13, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v4, s4 -; GFX10-NEXT: v_or_b32_e32 v3, v22, v26 -; GFX10-NEXT: v_or_b32_e32 v4, v11, v5 -; GFX10-NEXT: v_or_b32_e32 v5, v14, v8 -; GFX10-NEXT: v_or_b32_e32 v6, v6, v9 -; GFX10-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s4 +; GFX10-NEXT: v_or_b32_e32 v2, v16, v2 +; GFX10-NEXT: v_or_b32_e32 v3, v17, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_v2i128: @@ -8409,100 +8414,110 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_xor_b32_e32 v17, -1, v16 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 31, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX11-NEXT: v_lshlrev_b64 v[23:24], v25, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_cndmask_b32 v23, 0, v23 :: v_dual_and_b32 v26, 0x7f, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, 0, v24, vcc_lo -; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v25 -; GFX11-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3] -; GFX11-NEXT: v_subrev_nc_u32_e32 v19, 64, v25 -; GFX11-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v26 -; GFX11-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX11-NEXT: v_and_b32_e32 v23, 0x7f, v17 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v23 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX11-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX11-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX11-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9] -; GFX11-NEXT: v_cndmask_b32_e32 v22, v1, v22, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX11-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v16, s0 -; GFX11-NEXT: v_xor_b32_e32 v16, -1, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v18, v21, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v17, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v16 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v9 :: v_dual_cndmask_b32 v0, v0, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 31, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v24 +; GFX11-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v25, v18, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 31, v5 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] -; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v25 -; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, v3, s0 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v25 -; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX11-NEXT: v_or_b32_e32 v0, v23, v0 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v9, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[16:17], v25, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v25, v[6:7] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 +; GFX11-NEXT: v_or_b32_e32 v26, v19, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v6, v6, v16 +; GFX11-NEXT: v_xor_b32_e32 v16, -1, v20 +; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v20 +; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v27 +; GFX11-NEXT: v_lshlrev_b64 v[18:19], v27, v[6:7] +; GFX11-NEXT: v_lshrrev_b64 v[16:17], v16, v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v28, v16, v18 +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v20 +; GFX11-NEXT: v_or_b32_e32 v29, v17, v19 +; GFX11-NEXT: v_lshrrev_b64 v[18:19], v20, v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[14:15] +; GFX11-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v19, v19, v17 +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v21, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v22, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v2, s0 +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 64, v24 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v3, s0 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v24 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v25, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v26, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v2, v8, s1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 64, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v9, v3, v9, s1 +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v27 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v27, v[4:5] +; GFX11-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v20, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v22, v2, v28, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v23, v3, v29, s1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v24, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v21, 0, v4, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, v5, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v7, v23, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v3, s0 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v20 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v25 -; GFX11-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX11-NEXT: v_and_b32_e32 v23, 0x7f, v20 -; GFX11-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX11-NEXT: v_or_b32_e32 v5, v9, v11 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b64 v[3:4], v3, v[14:15] +; GFX11-NEXT: v_or_b32_e32 v2, v16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v18, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v19, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v11, 0, v16 :: v_dual_cndmask_b32 v10, v3, v10 -; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v23 -; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v23 -; GFX11-NEXT: v_lshrrev_b64 v[18:19], v23, v[12:13] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v23 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[14:15] -; GFX11-NEXT: v_lshrrev_b64 v[3:4], v23, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v23 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v10, v6, s2 -; GFX11-NEXT: v_or_b32_e32 v16, v18, v20 -; GFX11-NEXT: v_or_b32_e32 v18, v19, v21 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v4, s0 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v9 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v16, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v18, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v3, v12, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v4, v13, s2 +; GFX11-NEXT: v_or_b32_e32 v3, v17, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v7, v7, v10 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v8, v12, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v8, v9, v13, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v3, s0 -; GFX11-NEXT: v_or_b32_e32 v3, v22, v26 -; GFX11-NEXT: v_or_b32_e32 v4, v11, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v5, v14, v8 -; GFX11-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-NEXT: v_or_b32_e32 v4, v21, v11 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v12 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) ret <2 x i128> %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -511,17 +511,17 @@ ; ; GFX10-LABEL: insertelement_v_v2i16_s_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX10-NEXT: s_and_b32 s0, s2, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, v0, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -691,16 +691,16 @@ ; ; GFX10-LABEL: insertelement_v_v2i16_v_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, v0, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -869,11 +869,11 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v2, s2, s1 +; GFX10-NEXT: v_and_or_b32 v2, v2, s2, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -988,7 +988,7 @@ ; GFX10-NEXT: s_lshr_b32 s2, s4, 1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s3, s1, s0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 @@ -996,13 +996,13 @@ ; GFX10-NEXT: s_lshl_b32 s4, s4, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 0 ; GFX10-NEXT: s_andn2_b32 s3, s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v4, v2, s4, s3 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, s4, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -1124,24 +1124,24 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v0 ; GFX10-NEXT: s_and_b32 s2, s4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v0, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 -; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -1262,23 +1262,23 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v2, 0xffff +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v2, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 -; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -1390,21 +1390,21 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v3, 1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX10-NEXT: s_and_b32 s0, s2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v3, v5, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -1519,12 +1519,12 @@ ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v3, s0, v2 +; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -1631,20 +1631,20 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v4, 1, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v4, 0xffff +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v5, v4, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -1970,14 +1970,14 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 -; GFX10-NEXT: v_and_or_b32 v6, v4, s3, s2 +; GFX10-NEXT: v_and_or_b32 v4, v4, s3, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; @@ -2129,33 +2129,33 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s5, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s5, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s6, s1, s0 ; GFX10-NEXT: s_cmp_eq_u32 s5, 2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s6, s2, s6 ; GFX10-NEXT: s_cmp_eq_u32 s5, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_cselect_b32 s6, s3, s6 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: s_lshl_b32 s4, s4, 4 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_lshl_b32 s7, 0xffff, s4 ; GFX10-NEXT: s_andn2_b32 s6, s6, s7 -; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX10-NEXT: v_lshl_or_b32 v4, v0, s4, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; @@ -2317,33 +2317,33 @@ ; GFX10-LABEL: insertelement_s_v8i16_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 ; GFX10-NEXT: s_and_b32 s1, s4, 0xffff -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s9 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s11, s1 +; GFX10-NEXT: v_and_or_b32 v5, v0, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; @@ -2502,33 +2502,33 @@ ; GFX10-LABEL: insertelement_s_v8i16_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, 0xffff +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s7, s1 +; GFX10-NEXT: v_and_or_b32 v5, v1, v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; @@ -2667,30 +2667,30 @@ ; ; GFX10-LABEL: insertelement_v_v8i16_s_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v2 ; GFX10-NEXT: s_and_b32 s1, s2, 0xffff -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v3 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v5, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v5, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v3, s1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v7, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 -; GFX10-NEXT: v_and_or_b32 v9, v2, v7, v0 -; GFX10-NEXT: v_mov_b32_e32 v7, 0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 -; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v8i16_s_v: @@ -2829,20 +2829,20 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s2 -; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_not_b32 s2, s2 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 -; GFX10-NEXT: v_and_or_b32 v9, v0, s2, v1 +; GFX10-NEXT: v_and_or_b32 v7, v0, s2, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 -; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v7, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v7, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v8i16_v_s: @@ -2973,29 +2973,29 @@ ; ; GFX10-LABEL: insertelement_v_v8i16_v_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v3 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v4, 0xffff +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 -; GFX10-NEXT: v_and_or_b32 v3, v3, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 -; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v3, s1 +; GFX10-NEXT: v_and_or_b32 v4, v7, v4, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v8i16_v_v: @@ -3163,10 +3163,8 @@ ; GFX10-NEXT: s_and_b32 s1, s4, 0xffff ; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s0 ; GFX10-NEXT: s_lshl_b32 s0, s1, s0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_mov_b32_e32 v10, 16 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_movrels_b32 s3, s8 ; GFX10-NEXT: s_andn2_b32 s1, s3, s2 @@ -3176,12 +3174,14 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 -; GFX10-NEXT: v_mov_b32_e32 v6, s14 -; GFX10-NEXT: v_mov_b32_e32 v7, s15 -; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: v_mov_b32_e32 v4, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: v_mov_b32_e32 v2, s14 +; GFX10-NEXT: v_mov_b32_e32 v3, s15 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v16i16_s_s: @@ -3328,16 +3328,16 @@ ; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s0 ; GFX10-NEXT: s_lshl_b32 s0, s1, s0 ; GFX10-NEXT: s_not_b32 s1, s2 -; GFX10-NEXT: v_mov_b32_e32 v10, 16 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_movrels_b32_e32 v0, v2 -; GFX10-NEXT: v_and_or_b32 v12, v0, s1, s0 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, s0 +; GFX10-NEXT: v_movreld_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_movreld_b32_e32 v2, v12 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX10-NEXT: global_store_dwordx4 v[10:11], v[6:9], off +; GFX10-NEXT: v_mov_b32_e32 v0, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[6:9], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v16i16_s_s: @@ -3497,27 +3497,27 @@ ; GFX10-NEXT: s_and_b32 s0, s4, 1 ; GFX10-NEXT: s_lshr_b32 m0, s4, 1 ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_lshl_b32 s1, 0xffff, s0 -; GFX10-NEXT: v_mov_b32_e32 v10, 16 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_movrels_b32 s2, s8 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: s_andn2_b32 s1, s2, s1 +; GFX10-NEXT: v_lshl_or_b32 v8, v0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_lshl_or_b32 v12, v8, s0, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_mov_b32_e32 v4, s12 ; GFX10-NEXT: v_mov_b32_e32 v5, s13 ; GFX10-NEXT: v_mov_b32_e32 v6, s14 ; GFX10-NEXT: v_mov_b32_e32 v7, s15 +; GFX10-NEXT: v_movreld_b32_e32 v0, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_movreld_b32_e32 v0, v12 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX10-NEXT: v_mov_b32_e32 v0, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v16i16_v_s: @@ -3738,77 +3738,77 @@ ; GFX10-LABEL: insertelement_s_v16i16_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s5, s4, 0xffff -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v8 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s15, s5 +; GFX10-NEXT: v_and_or_b32 v10, v1, v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8 ; GFX10-NEXT: v_mov_b32_e32 v4, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v10, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v10, s1 ; GFX10-NEXT: v_mov_b32_e32 v5, s13 ; GFX10-NEXT: v_mov_b32_e32 v6, s14 ; GFX10-NEXT: v_mov_b32_e32 v7, s15 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_mov_b32_e32 v10, 16 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s2 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX10-NEXT: v_mov_b32_e32 v0, 16 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v10, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v16i16_s_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 1, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 1, v0 ; GFX11-NEXT: s_and_b32 s5, s4, 0xffff ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v8 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e64 v8, v0, s5 +; GFX11-NEXT: v_lshlrev_b32_e64 v9, v0, s5 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s9 -; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v8 +; GFX11-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 @@ -3818,24 +3818,26 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v11, v1, s15, s5 ; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 -; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8 ; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 ; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 +; GFX11-NEXT: v_and_or_b32 v9, v11, v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, 16 ; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v9, s6 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v9, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v9, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v9, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v9, s5 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v9, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off @@ -4030,78 +4032,77 @@ ; GFX10-LABEL: insertelement_s_v16i16_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v8 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s15, s5 +; GFX10-NEXT: v_and_or_b32 v10, v2, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8 ; GFX10-NEXT: v_mov_b32_e32 v4, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v10, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v10, s1 ; GFX10-NEXT: v_mov_b32_e32 v5, s13 ; GFX10-NEXT: v_mov_b32_e32 v6, s14 ; GFX10-NEXT: v_mov_b32_e32 v7, s15 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_mov_b32_e32 v10, 16 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s2 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX10-NEXT: v_mov_b32_e32 v0, 16 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v10, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v16i16_v_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 1, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 1, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v8 +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v8 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s9 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, v1, v0 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_lshlrev_b32 v9, v1, v0 +; GFX11-NEXT: v_xor_b32_e32 v10, -1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4111,25 +4112,25 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13 -; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10 -; GFX11-NEXT: v_mov_b32_e32 v7, s15 -; GFX11-NEXT: v_mov_b32_e32 v3, s11 -; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v4, s12 -; GFX11-NEXT: v_mov_b32_e32 v6, s14 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v2, s15, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_and_or_b32 v9, v11, v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, 16 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v9, s6 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v9, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v9, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v9, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v9, s5 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v13 :: v_dual_mov_b32 v10, 16 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 -; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off @@ -4289,45 +4290,45 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX10-NEXT: s_and_b32 s5, s2, 0xffff -; GFX10-NEXT: v_mov_b32_e32 v13, 16 -; GFX10-NEXT: v_mov_b32_e32 v14, 0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 4, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 4, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v11 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v11 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 -; GFX10-NEXT: v_and_or_b32 v15, v1, v11, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v15, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v15, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v15, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v15, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v15, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v15, s5 -; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v[13:14], v[4:7], off +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v10, s5 +; GFX10-NEXT: v_and_or_b32 v12, v0, v2, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v12, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v12, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v12, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v12, s2 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: v_mov_b32_e32 v0, 16 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v12, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v12, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v12, s5 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v16i16_s_v: @@ -4337,17 +4338,17 @@ ; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX11-NEXT: s_and_b32 s5, s2, 0xffff -; GFX11-NEXT: v_dual_mov_b32 v13, 16 :: v_dual_and_b32 v2, 1, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 4, v0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, s5 ; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 @@ -4364,21 +4365,21 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v15, v1, v11, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v11, v1, v11, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v11, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v11, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v11, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v8, v11, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v11, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_cndmask_b32 v1, v4, v11 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v7, v11, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v10, v11, s5 +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 ; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: v_mov_b32_e32 v12, 0 -; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_cndmask_b32 v1, v4, v15 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v15, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v15, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v15, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v7, v15, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v8, v15, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v15, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v10, v15, s5 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[11:12], v[0:3], off -; GFX11-NEXT: global_store_b128 v[13:14], v[4:7], off +; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(1) %ptr @@ -4491,19 +4492,19 @@ ; GFX10-NEXT: s_and_b32 s0, s2, 1 ; GFX10-NEXT: s_lshr_b32 m0, s2, 1 ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: v_mov_b32_e32 v11, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 -; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: s_lshl_b32 s1, 0xffff, s0 +; GFX10-NEXT: s_not_b32 s0, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_movrels_b32_e32 v1, v3 -; GFX10-NEXT: v_and_or_b32 v2, v1, s0, v0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s0, v0 +; GFX10-NEXT: v_movreld_b32_e32 v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_movreld_b32_e32 v3, v2 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[3:6], off -; GFX10-NEXT: global_store_dwordx4 v[11:12], v[7:10], off +; GFX10-NEXT: v_mov_b32_e32 v0, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v16i16_v_s: @@ -4684,44 +4685,44 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_mov_b32_e32 v14, 16 -; GFX10-NEXT: v_mov_b32_e32 v15, 0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v12 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 -; GFX10-NEXT: v_mov_b32_e32 v13, 0 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, 0xffff +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5 -; GFX10-NEXT: v_and_or_b32 v16, v1, v3, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v16, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v16, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v16, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v16, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v16, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v16, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v16, s5 -; GFX10-NEXT: global_store_dwordx4 v[12:13], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v[14:15], v[4:7], off +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s5 +; GFX10-NEXT: v_and_or_b32 v13, v0, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v13, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v13, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v13, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v13, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v13, s5 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: v_mov_b32_e32 v0, 16 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v13, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v13, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v16i16_v_v: @@ -4730,8 +4731,8 @@ ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off ; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX11-NEXT: v_dual_mov_b32 v14, 16 :: v_dual_and_b32 v3, 1, v3 -; GFX11-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_and_b32 v2, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 @@ -4742,12 +4743,10 @@ ; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v12 -; GFX11-NEXT: v_mov_b32_e32 v12, 0 -; GFX11-NEXT: v_mov_b32_e32 v13, 0 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v3, 0xffff +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4761,18 +4760,21 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v16, v1, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v16, s6 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v16, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v16, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v16, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v16, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v10, v16, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v11, v16, s5 +; GFX11-NEXT: v_and_or_b32 v12, v1, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v12, s6 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v12, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v12, s3 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v12, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v12, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v10, v12, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v11, v12, s5 +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[12:13], v[0:3], off -; GFX11-NEXT: global_store_b128 v[14:15], v[4:7], off +; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(1) %ptr Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -1309,17 +1309,17 @@ ; ; GFX10-LABEL: insertelement_v_v4i8_s_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v0, 3, v2 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v1, 3, v2 ; GFX10-NEXT: s_and_b32 s0, s2, 0xff -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, v0, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -1493,16 +1493,16 @@ ; ; GFX10-LABEL: insertelement_v_v4i8_v_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v0, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v1, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xff +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, v0, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -1750,11 +1750,11 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v2, s2, s1 +; GFX10-NEXT: v_and_or_b32 v2, v2, s2, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -1870,7 +1870,7 @@ ; GFX10-NEXT: s_lshr_b32 s2, s4, 2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s3, s1, s0 ; GFX10-NEXT: s_and_b32 s4, s4, 3 @@ -1878,13 +1878,13 @@ ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_lshl_b32 s5, 0xff, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 0 ; GFX10-NEXT: s_andn2_b32 s3, s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v4, v2, s4, s3 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, s4, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -2007,24 +2007,24 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v0 ; GFX10-NEXT: s_and_b32 s2, s4, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v0, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 -; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -2146,23 +2146,23 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v2, 0xff +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v2, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 -; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -2276,21 +2276,21 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v3, 3, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 2, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; GFX10-NEXT: s_and_b32 s0, s2, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xff -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v3, v5, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -2407,12 +2407,12 @@ ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v3, s0, v2 +; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -2521,20 +2521,20 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v4, 3, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 2, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xff +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v4, 0xff +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v5, v4, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -2860,14 +2860,14 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 -; GFX10-NEXT: v_and_or_b32 v6, v4, s3, s2 +; GFX10-NEXT: v_and_or_b32 v4, v4, s3, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; @@ -3019,33 +3019,33 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s5, s4, 2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s5, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s6, s1, s0 ; GFX10-NEXT: s_cmp_eq_u32 s5, 2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s6, s2, s6 ; GFX10-NEXT: s_cmp_eq_u32 s5, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_cselect_b32 s6, s3, s6 ; GFX10-NEXT: s_and_b32 s4, s4, 3 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_lshl_b32 s7, 0xff, s4 ; GFX10-NEXT: s_andn2_b32 s6, s6, s7 -; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX10-NEXT: v_lshl_or_b32 v4, v0, s4, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; @@ -3207,33 +3207,33 @@ ; GFX10-LABEL: insertelement_s_v16i8_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 2, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v0 ; GFX10-NEXT: s_and_b32 s1, s4, 0xff -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s9 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s11, s1 +; GFX10-NEXT: v_and_or_b32 v5, v0, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; @@ -3392,33 +3392,33 @@ ; GFX10-LABEL: insertelement_s_v16i8_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 2, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, 0xff +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s7, s1 +; GFX10-NEXT: v_and_or_b32 v5, v1, v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; @@ -3557,30 +3557,30 @@ ; ; GFX10-LABEL: insertelement_v_v16i8_s_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 3, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v2 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, 0xff -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v5, 0xff +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v5, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v3, s1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v7, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 -; GFX10-NEXT: v_and_or_b32 v9, v2, v7, v0 -; GFX10-NEXT: v_mov_b32_e32 v7, 0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 -; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v16i8_s_v: @@ -3719,20 +3719,20 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_lshl_b32 s2, 0xff, s2 -; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_not_b32 s2, s2 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 -; GFX10-NEXT: v_and_or_b32 v9, v0, s2, v1 +; GFX10-NEXT: v_and_or_b32 v7, v0, s2, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 -; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v7, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v7, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v16i8_v_s: @@ -3863,29 +3863,29 @@ ; ; GFX10-LABEL: insertelement_v_v16i8_v_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v3 -; GFX10-NEXT: v_and_b32_e32 v0, 3, v3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, 0xff -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 2, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v4, 0xff +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 -; GFX10-NEXT: v_and_or_b32 v3, v3, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 -; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v3, s1 +; GFX10-NEXT: v_and_or_b32 v4, v7, v4, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v16i8_v_v: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -66,40 +66,54 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0xf -; GFX10-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] -; GFX10-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v64, s[0:1] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:48 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:64 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:80 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:96 -; GFX10-NEXT: global_load_dwordx4 v[32:35], v64, s[0:1] offset:112 -; GFX10-NEXT: global_load_dwordx4 v[36:39], v64, s[0:1] offset:160 -; GFX10-NEXT: global_load_dwordx4 v[40:43], v64, s[0:1] offset:176 -; GFX10-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] offset:192 -; GFX10-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:208 -; GFX10-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:224 -; GFX10-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:240 -; GFX10-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:128 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v64, s[0:1] offset:144 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[12:15], v64, s[0:1] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:80 +; GFX10-NEXT: global_load_dwordx4 v[32:35], v64, s[0:1] offset:96 +; GFX10-NEXT: global_load_dwordx4 v[36:39], v64, s[0:1] offset:112 +; GFX10-NEXT: global_load_dwordx4 v[40:43], v64, s[0:1] offset:160 +; GFX10-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] offset:176 +; GFX10-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:192 +; GFX10-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:208 +; GFX10-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:224 +; GFX10-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:240 +; GFX10-NEXT: s_waitcnt vmcnt(14) ; GFX10-NEXT: v_mov_b32_e32 v5, 0x3e7 -; GFX10-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] -; GFX10-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:16 -; GFX10-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:32 -; GFX10-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:48 -; GFX10-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:64 -; GFX10-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:80 -; GFX10-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:96 -; GFX10-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] offset:112 -; GFX10-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:128 ; GFX10-NEXT: global_store_dwordx4 v64, v[4:7], s[2:3] offset:144 -; GFX10-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:160 -; GFX10-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:176 -; GFX10-NEXT: global_store_dwordx4 v64, v[44:47], s[2:3] offset:192 -; GFX10-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:208 -; GFX10-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:224 -; GFX10-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:240 +; GFX10-NEXT: s_waitcnt vmcnt(13) +; GFX10-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(12) +; GFX10-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(11) +; GFX10-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:32 +; GFX10-NEXT: s_waitcnt vmcnt(10) +; GFX10-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:48 +; GFX10-NEXT: s_waitcnt vmcnt(9) +; GFX10-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:64 +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:80 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] offset:96 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:112 +; GFX10-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:160 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: global_store_dwordx4 v64, v[44:47], s[2:3] offset:176 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:192 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:208 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:224 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:240 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_insert_v64i32_37: @@ -107,36 +121,44 @@ ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] -; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:16 -; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:32 -; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:48 -; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:64 -; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:80 -; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:96 -; GFX11-NEXT: global_load_b128 v[32:35], v64, s[0:1] offset:112 -; GFX11-NEXT: global_load_b128 v[36:39], v64, s[0:1] offset:128 ; GFX11-NEXT: global_load_b128 v[4:7], v64, s[0:1] offset:144 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, 0x3e7 +; GFX11-NEXT: s_clause 0xe +; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128 +; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] +; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:16 +; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:32 +; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:48 +; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:64 +; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:80 +; GFX11-NEXT: global_load_b128 v[32:35], v64, s[0:1] offset:96 +; GFX11-NEXT: global_load_b128 v[36:39], v64, s[0:1] offset:112 ; GFX11-NEXT: global_load_b128 v[40:43], v64, s[0:1] offset:160 ; GFX11-NEXT: global_load_b128 v[44:47], v64, s[0:1] offset:176 ; GFX11-NEXT: global_load_b128 v[48:51], v64, s[0:1] offset:192 ; GFX11-NEXT: global_load_b128 v[52:55], v64, s[0:1] offset:208 ; GFX11-NEXT: global_load_b128 v[56:59], v64, s[0:1] offset:224 ; GFX11-NEXT: global_load_b128 v[60:63], v64, s[0:1] offset:240 -; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_mov_b32_e32 v5, 0x3e7 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: global_store_b128 v64, v[0:3], s[2:3] -; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:16 -; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:32 -; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:48 -; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:64 -; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:80 -; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:96 -; GFX11-NEXT: global_store_b128 v64, v[32:35], s[2:3] offset:112 -; GFX11-NEXT: global_store_b128 v64, v[36:39], s[2:3] offset:128 ; GFX11-NEXT: global_store_b128 v64, v[4:7], s[2:3] offset:144 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: global_store_b128 v64, v[0:3], s[2:3] offset:128 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:64 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:80 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: global_store_b128 v64, v[32:35], s[2:3] offset:96 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: global_store_b128 v64, v[36:39], s[2:3] offset:112 ; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: global_store_b128 v64, v[40:43], s[2:3] offset:160 ; GFX11-NEXT: s_waitcnt vmcnt(4) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -130,38 +130,37 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s11, 0x41000000 -; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: s_mov_b32 s10, 0x40e00000 ; GFX10-NEXT: s_mov_b32 s9, 0x40c00000 ; GFX10-NEXT: s_mov_b32 s8, 0x40a00000 ; GFX10-NEXT: s_mov_b32 s7, 4.0 ; GFX10-NEXT: s_mov_b32 s6, 0x40400000 ; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v15, s11 -; GFX10-NEXT: v_mov_b32_e32 v8, s4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, s5 -; GFX10-NEXT: v_mov_b32_e32 v10, s6 -; GFX10-NEXT: v_mov_b32_e32 v11, s7 +; GFX10-NEXT: v_mov_b32_e32 v14, s10 +; GFX10-NEXT: v_mov_b32_e32 v13, s9 ; GFX10-NEXT: v_mov_b32_e32 v12, s8 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v11, s7 +; GFX10-NEXT: v_mov_b32_e32 v10, s6 +; GFX10-NEXT: v_mov_b32_e32 v9, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 0, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_mov_b32_e32 v13, s9 -; GFX10-NEXT: v_mov_b32_e32 v14, s10 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 3, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 5, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 6, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 7, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s10 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v0, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v14, v0, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v0, s9 ; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -253,31 +252,30 @@ ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: v_mov_b32_e32 v15, s7 -; GFX10-NEXT: v_mov_b32_e32 v8, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_mov_b32_e32 v9, s1 -; GFX10-NEXT: v_mov_b32_e32 v10, s2 -; GFX10-NEXT: v_mov_b32_e32 v11, s3 -; GFX10-NEXT: v_mov_b32_e32 v12, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, s10, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v8, s7 +; GFX10-NEXT: v_mov_b32_e32 v7, s6 +; GFX10-NEXT: v_mov_b32_e32 v6, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_mov_b32_e32 v13, s5 -; GFX10-NEXT: v_mov_b32_e32 v14, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, s10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, s10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, s10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, s10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, s10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v14, s10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, s10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s10, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s10, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s10, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s10, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s10, s5 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: dyn_insertelement_v8f32_s_s_v: @@ -483,30 +481,29 @@ ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: v_mov_b32_e32 v15, s7 -; GFX10-NEXT: v_mov_b32_e32 v8, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, s1 -; GFX10-NEXT: v_mov_b32_e32 v10, s2 -; GFX10-NEXT: v_mov_b32_e32 v11, s3 +; GFX10-NEXT: v_mov_b32_e32 v14, s6 +; GFX10-NEXT: v_mov_b32_e32 v13, s5 ; GFX10-NEXT: v_mov_b32_e32 v12, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v11, s3 +; GFX10-NEXT: v_mov_b32_e32 v10, s2 +; GFX10-NEXT: v_mov_b32_e32 v9, s1 +; GFX10-NEXT: v_mov_b32_e32 v8, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_mov_b32_e32 v13, s5 -; GFX10-NEXT: v_mov_b32_e32 v14, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v0, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v14, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v0, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; GFX10-NEXT: ; return to shader part epilog ; @@ -569,25 +566,45 @@ ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: dyn_insertelement_v8f32_v_s_v: -; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, v4, s2, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v5, v5, s2, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v6, v6, s2, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v7, v7, s2, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: dyn_insertelement_v8f32_v_s_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 4, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 6, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 7, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 0, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s2, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s2, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s2, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s2, s6 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v8f32_v_s_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s2, vcc_lo +; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <8 x float> %vec, float %val, i32 %idx ret <8 x float> %insert @@ -678,25 +695,45 @@ ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: dyn_insertelement_v8f32_v_v_v: -; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: dyn_insertelement_v8f32_v_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v8, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v8, s5 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v8f32_v_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo +; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <8 x float> %vec, float %val, i32 %idx ret <8 x float> %insert @@ -902,21 +939,21 @@ ; GFX10-NEXT: v_mov_b32_e32 v16, s17 ; GFX10-NEXT: v_mov_b32_e32 v17, s18 ; GFX10-NEXT: v_mov_b32_e32 v18, s19 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 0, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 3, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 2, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 5, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 6, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 7, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v0, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v0, s10 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v1, s10 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v1, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v1, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v0, s7 @@ -942,7 +979,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_mov_b32 s14, 0 ; GFX11-NEXT: s_mov_b32 s15, 0x40200000 -; GFX11-NEXT: s_mov_b64 s[0:1], 1.0 ; GFX11-NEXT: s_mov_b32 s13, 0x401c0000 ; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s11, 0x40180000 @@ -953,34 +989,38 @@ ; GFX11-NEXT: s_mov_b32 s5, 0x40080000 ; GFX11-NEXT: s_mov_b32 s4, s14 ; GFX11-NEXT: s_mov_b64 s[2:3], 2.0 +; GFX11-NEXT: s_mov_b64 s[0:1], 1.0 ; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 ; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 ; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 ; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 ; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 ; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v12, v12, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0 -; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v0 :: v_dual_cndmask_b32 v16, v16, v1 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v0, s6 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v0 :: v_dual_cndmask_b32 v6, v6, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v1, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v1, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, v0, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v1, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5 ; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[7:10], off dlc @@ -1093,9 +1133,6 @@ ; GFX10-NEXT: s_mov_b32 s12, s14 ; GFX10-NEXT: s_mov_b32 s14, s16 ; GFX10-NEXT: v_mov_b32_e32 v16, s15 -; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_mov_b32_e32 v15, s14 ; GFX10-NEXT: v_mov_b32_e32 v14, s13 ; GFX10-NEXT: v_mov_b32_e32 v13, s12 @@ -1109,36 +1146,39 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: v_mov_b32_e32 v4, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 4, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 5, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 6, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s18, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s19, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, s19, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, s18, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, s19, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, s18, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, s19, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, s18, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, s19, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, s18, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, s19, s4 -; GFX10-NEXT: global_store_dwordx4 v[0:1], v[1:4], off +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 0, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 6, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s18, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s19, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s19, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s18, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s19, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s18, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s19, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v9, s18, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, s19, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v11, s18, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, s19, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v13, s18, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v14, s19, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v15, s18, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v16, s19, s6 +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dwordx4 v[0:1], v[5:8], off +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dwordx4 v[0:1], v[9:12], off +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dwordx4 v[0:1], v[13:16], off +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; @@ -1161,44 +1201,44 @@ ; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s14, s16 ; GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_dual_mov_b32 v14, s13 :: v_dual_mov_b32 v13, s12 ; GFX11-NEXT: v_dual_mov_b32 v12, s11 :: v_dual_mov_b32 v11, s10 ; GFX11-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v9, s8 ; GFX11-NEXT: v_dual_mov_b32 v8, s7 :: v_dual_mov_b32 v7, s6 ; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v5, s4 ; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s18, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s19, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, s18, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s18, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s19, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, s18, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s19, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, s19, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s18, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, s19, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, s18, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, s19, vcc_lo -; GFX11-NEXT: global_store_b128 v[0:1], v[1:4], off dlc +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 0, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 6, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 7, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s18, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s19, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, s18, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v4, s19, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v5, s18, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, s19, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v7, s18, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v8, s19, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v9, s18, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v10, s19, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v11, s18, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v12, s19, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v13, s18, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v14, s19, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v15, s18, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v15, v16, s19, s6 +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[5:8], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[9:12], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[13:16], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1528,21 +1568,21 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 2, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v0, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v1, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v1, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v0, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v0, s3 @@ -1581,33 +1621,36 @@ ; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s14, s16 ; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 ; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 ; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 ; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 ; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 ; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v12, v12, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0 -; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v0 :: v_dual_cndmask_b32 v16, v16, v1 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v0, s6 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v0 :: v_dual_cndmask_b32 v6, v6, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v1, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v1, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, v0, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v1, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5 ; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[7:10], off dlc @@ -2325,26 +2368,47 @@ ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: dyn_insertelement_v8f32_v_v_v_add_1: -; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: v_add_nc_u32_e32 v9, 1, v9 -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: dyn_insertelement_v8f32_v_v_v_add_1: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v8, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v8, s5 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v8f32_v_v_v_add_1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_add_nc_u32_e32 v9, 1, v9 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo +; GFX11-NEXT: ; return to shader part epilog entry: %idx.add = add i32 %idx, 1 %insert = insertelement <8 x float> %vec, float %val, i32 %idx.add @@ -2373,26 +2437,47 @@ ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: dyn_insertelement_v8f32_v_v_v_add_7: -; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: v_add_nc_u32_e32 v9, 7, v9 -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: dyn_insertelement_v8f32_v_v_v_add_7: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_add_nc_u32_e32 v9, 7, v9 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v8, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v8, s5 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v8f32_v_v_v_add_7: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_add_nc_u32_e32 v9, 7, v9 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo +; GFX11-NEXT: ; return to shader part epilog entry: %idx.add = add i32 %idx, 7 %insert = insertelement <8 x float> %vec, float %val, i32 %idx.add @@ -2471,25 +2556,25 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 -; GFX10-NEXT: v_mov_b32_e32 v6, s6 -; GFX10-NEXT: v_mov_b32_e32 v7, s7 -; GFX10-NEXT: v_mov_b32_e32 v8, s8 -; GFX10-NEXT: v_mov_b32_e32 v9, s9 -; GFX10-NEXT: v_mov_b32_e32 v10, s10 -; GFX10-NEXT: v_mov_b32_e32 v11, s11 -; GFX10-NEXT: v_mov_b32_e32 v12, s12 -; GFX10-NEXT: v_mov_b32_e32 v13, s13 -; GFX10-NEXT: v_mov_b32_e32 v14, s14 -; GFX10-NEXT: v_mov_b32_e32 v15, s15 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: v_mov_b32_e32 v2, s14 +; GFX10-NEXT: v_mov_b32_e32 v3, s15 +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; @@ -2801,35 +2886,34 @@ ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v18, s8 -; GFX10-NEXT: v_mov_b32_e32 v10, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_mov_b32_e32 v11, s1 -; GFX10-NEXT: v_mov_b32_e32 v12, s2 -; GFX10-NEXT: v_mov_b32_e32 v13, s3 -; GFX10-NEXT: v_mov_b32_e32 v14, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v17, s8 +; GFX10-NEXT: v_mov_b32_e32 v16, s7 +; GFX10-NEXT: v_mov_b32_e32 v15, s6 +; GFX10-NEXT: v_mov_b32_e32 v14, s5 +; GFX10-NEXT: v_mov_b32_e32 v13, s4 +; GFX10-NEXT: v_mov_b32_e32 v12, s3 +; GFX10-NEXT: v_mov_b32_e32 v11, s2 +; GFX10-NEXT: v_mov_b32_e32 v10, s1 +; GFX10-NEXT: v_mov_b32_e32 v9, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 0, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_mov_b32_e32 v15, s5 -; GFX10-NEXT: v_mov_b32_e32 v16, s6 -; GFX10-NEXT: v_mov_b32_e32 v17, s7 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v11, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v18, v0, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 8, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s7 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v11, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v14, v0, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v17, v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: dyn_insertelement_v9f32_s_v_v: @@ -2914,27 +2998,49 @@ ; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: dyn_insertelement_v9f32_v_v_v: -; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v10 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v10 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v10 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v10 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v10 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v10 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v10 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: dyn_insertelement_v9f32_v_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 8, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 0, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v9, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v9, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v9, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s6 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v9f32_v_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <9 x float> %vec, float %val, i32 %idx ret <9 x float> %insert @@ -3082,36 +3188,35 @@ ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: v_mov_b32_e32 v19, s9 -; GFX10-NEXT: v_mov_b32_e32 v10, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_mov_b32_e32 v11, s1 -; GFX10-NEXT: v_mov_b32_e32 v12, s2 -; GFX10-NEXT: v_mov_b32_e32 v13, s3 +; GFX10-NEXT: v_mov_b32_e32 v18, s8 +; GFX10-NEXT: v_mov_b32_e32 v17, s7 +; GFX10-NEXT: v_mov_b32_e32 v16, s6 +; GFX10-NEXT: v_mov_b32_e32 v15, s5 ; GFX10-NEXT: v_mov_b32_e32 v14, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v13, s3 +; GFX10-NEXT: v_mov_b32_e32 v12, s2 +; GFX10-NEXT: v_mov_b32_e32 v11, s1 +; GFX10-NEXT: v_mov_b32_e32 v10, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 0, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_mov_b32_e32 v15, s5 -; GFX10-NEXT: v_mov_b32_e32 v16, s6 -; GFX10-NEXT: v_mov_b32_e32 v17, s7 -; GFX10-NEXT: v_mov_b32_e32 v18, s8 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v18, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v19, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 8, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 9, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v0, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v12, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v13, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v15, v0, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v16, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v17, v0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v18, v0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v19, v0, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, v10 ; GFX10-NEXT: ; return to shader part epilog ; @@ -3135,8 +3240,10 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX11-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 8, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v18, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc_lo @@ -3148,8 +3255,6 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v18, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v11 ; GFX11-NEXT: v_dual_cndmask_b32 v9, v19, v0 :: v_dual_mov_b32 v0, v10 @@ -3202,29 +3307,53 @@ ; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: dyn_insertelement_v10f32_v_v_v: -; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v11 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v11 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v11 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v11 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v11 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v11 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v11 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: dyn_insertelement_v10f32_v_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 8, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 9, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 0, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v10, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v10, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v10, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v10, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v10, s7 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v10f32_v_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc_lo +; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <10 x float> %vec, float %val, i32 %idx ret <10 x float> %insert @@ -3381,41 +3510,40 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: v_mov_b32_e32 v22, s10 -; GFX10-NEXT: v_mov_b32_e32 v12, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_mov_b32_e32 v13, s1 -; GFX10-NEXT: v_mov_b32_e32 v14, s2 -; GFX10-NEXT: v_mov_b32_e32 v15, s3 -; GFX10-NEXT: v_mov_b32_e32 v16, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v21, s10 +; GFX10-NEXT: v_mov_b32_e32 v20, s9 +; GFX10-NEXT: v_mov_b32_e32 v19, s8 +; GFX10-NEXT: v_mov_b32_e32 v18, s7 +; GFX10-NEXT: v_mov_b32_e32 v17, s6 +; GFX10-NEXT: v_mov_b32_e32 v16, s5 +; GFX10-NEXT: v_mov_b32_e32 v15, s4 +; GFX10-NEXT: v_mov_b32_e32 v14, s3 +; GFX10-NEXT: v_mov_b32_e32 v13, s2 +; GFX10-NEXT: v_mov_b32_e32 v12, s1 +; GFX10-NEXT: v_mov_b32_e32 v11, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 0, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_mov_b32_e32 v17, s5 -; GFX10-NEXT: v_mov_b32_e32 v18, s6 -; GFX10-NEXT: v_mov_b32_e32 v19, s7 -; GFX10-NEXT: v_mov_b32_e32 v20, s8 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v13, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: v_mov_b32_e32 v21, s9 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v21, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v22, v0, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 8, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 9, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 10, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v0, s9 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v13, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v14, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v15, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v16, v0, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v17, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v18, v0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v19, v0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v20, v0, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v21, v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: dyn_insertelement_v11f32_s_v_v: @@ -3440,10 +3568,16 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX11-NEXT: v_dual_mov_b32 v20, s8 :: v_dual_mov_b32 v19, s7 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 9, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 10, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 7, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v21, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v22, v0, s1 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v19, v0, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc_lo @@ -3451,14 +3585,8 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v1 -; GFX11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_cndmask_b32 v10, v22, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_cndmask_b32 v8, v20, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, v12 ; GFX11-NEXT: ; return to shader part epilog entry: @@ -3511,31 +3639,57 @@ ; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: dyn_insertelement_v11f32_v_v_v: -; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v12 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v12 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v12 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v12 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v12 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v12 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v12 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: dyn_insertelement_v11f32_v_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 8, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 9, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 10, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 0, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v11, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v11, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v11, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v11, s8 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v11f32_v_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo +; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <11 x float> %vec, float %val, i32 %idx ret <11 x float> %insert @@ -3704,42 +3858,41 @@ ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: v_mov_b32_e32 v23, s11 -; GFX10-NEXT: v_mov_b32_e32 v12, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_mov_b32_e32 v13, s1 -; GFX10-NEXT: v_mov_b32_e32 v14, s2 -; GFX10-NEXT: v_mov_b32_e32 v15, s3 +; GFX10-NEXT: v_mov_b32_e32 v22, s10 +; GFX10-NEXT: v_mov_b32_e32 v21, s9 +; GFX10-NEXT: v_mov_b32_e32 v20, s8 +; GFX10-NEXT: v_mov_b32_e32 v19, s7 +; GFX10-NEXT: v_mov_b32_e32 v18, s6 +; GFX10-NEXT: v_mov_b32_e32 v17, s5 ; GFX10-NEXT: v_mov_b32_e32 v16, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v15, s3 +; GFX10-NEXT: v_mov_b32_e32 v14, s2 +; GFX10-NEXT: v_mov_b32_e32 v13, s1 +; GFX10-NEXT: v_mov_b32_e32 v12, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 0, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_mov_b32_e32 v17, s5 -; GFX10-NEXT: v_mov_b32_e32 v18, s6 -; GFX10-NEXT: v_mov_b32_e32 v19, s7 -; GFX10-NEXT: v_mov_b32_e32 v20, s8 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: v_mov_b32_e32 v21, s9 -; GFX10-NEXT: v_mov_b32_e32 v22, s10 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v21, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v22, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v23, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 8, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 9, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 10, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 11, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v0, s10 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v15, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v16, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v17, v0, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v18, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v19, v0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v20, v0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v21, v0, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v22, v0, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v23, v0, s9 ; GFX10-NEXT: v_mov_b32_e32 v0, v12 ; GFX10-NEXT: ; return to shader part epilog ; @@ -3764,30 +3917,30 @@ ; GFX11-NEXT: v_dual_mov_b32 v17, s5 :: v_dual_mov_b32 v16, s4 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v18, s6 ; GFX11-NEXT: v_dual_mov_b32 v21, s9 :: v_dual_mov_b32 v20, s8 +; GFX11-NEXT: v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v18, s6 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 8, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 9, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 10, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 11, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v20, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v21, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v22, v0, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v0, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v23, v0, s3 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v22, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v23, v0 :: v_dual_mov_b32 v0, v12 +; GFX11-NEXT: v_dual_cndmask_b32 v7, v19, v0 :: v_dual_mov_b32 v0, v12 ; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <12 x float> %vec, float %val, i32 %idx @@ -3841,33 +3994,61 @@ ; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: dyn_insertelement_v12f32_v_v_v: -; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v13 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v13 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v13 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v13 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v13 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v13 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v13 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v13 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v13 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v13 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v13 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: dyn_insertelement_v12f32_v_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 8, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 9, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 10, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 11, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 0, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v12, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v12, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v12, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v12, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v12, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v12, s9 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v12f32_v_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc_lo +; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <12 x float> %vec, float %val, i32 %idx ret <12 x float> %insert @@ -5603,26 +5784,26 @@ ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: v_mov_b32_e32 v13, s6 -; GFX10-NEXT: v_mov_b32_e32 v7, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 0 -; GFX10-NEXT: v_mov_b32_e32 v8, s1 -; GFX10-NEXT: v_mov_b32_e32 v9, s2 -; GFX10-NEXT: v_mov_b32_e32 v10, s3 +; GFX10-NEXT: v_mov_b32_e32 v12, s5 ; GFX10-NEXT: v_mov_b32_e32 v11, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v10, s3 +; GFX10-NEXT: v_mov_b32_e32 v9, s2 +; GFX10-NEXT: v_mov_b32_e32 v8, s1 +; GFX10-NEXT: v_mov_b32_e32 v7, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s9, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 1 -; GFX10-NEXT: v_mov_b32_e32 v12, s5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s9, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s9, 4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s9, 5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, s9, 6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v0, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v12, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 6 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v13, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v9, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v11, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v12, v0, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v13, v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; GFX10-NEXT: ; return to shader part epilog ; @@ -5704,29 +5885,28 @@ ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: v_mov_b32_e32 v14, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, s1 -; GFX10-NEXT: v_mov_b32_e32 v10, s2 -; GFX10-NEXT: v_mov_b32_e32 v11, s3 -; GFX10-NEXT: v_mov_b32_e32 v12, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v13, s6 +; GFX10-NEXT: v_mov_b32_e32 v12, s5 +; GFX10-NEXT: v_mov_b32_e32 v11, s4 +; GFX10-NEXT: v_mov_b32_e32 v10, s3 +; GFX10-NEXT: v_mov_b32_e32 v9, s2 +; GFX10-NEXT: v_mov_b32_e32 v8, s1 +; GFX10-NEXT: v_mov_b32_e32 v7, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_mov_b32_e32 v13, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v0, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v9, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v11, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v12, v0, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v13, v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: dyn_insertelement_v7f32_s_v_v: @@ -5823,23 +6003,41 @@ ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: dyn_insertelement_v7f32_v_v_v: -; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: dyn_insertelement_v7f32_v_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v7, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v7, s4 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v7f32_v_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <7 x float> %vec, float %val, i32 %idx ret <7 x float> %insert @@ -6145,38 +6343,38 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 -; GFX10-NEXT: v_readfirstlane_b32 s2, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v12, v1, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s3, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v1, s1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v1, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v0, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v7, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v11, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v13, v0, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v14, v1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v1, s4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v2 +; GFX10-NEXT: v_readfirstlane_b32 s2, v3 ; GFX10-NEXT: v_readfirstlane_b32 s1, v4 -; GFX10-NEXT: v_readfirstlane_b32 s4, v7 +; GFX10-NEXT: v_readfirstlane_b32 s3, v6 +; GFX10-NEXT: v_readfirstlane_b32 s4, v5 ; GFX10-NEXT: v_readfirstlane_b32 s5, v8 -; GFX10-NEXT: v_readfirstlane_b32 s6, v9 +; GFX10-NEXT: v_readfirstlane_b32 s6, v7 ; GFX10-NEXT: v_readfirstlane_b32 s7, v10 -; GFX10-NEXT: v_readfirstlane_b32 s8, v11 -; GFX10-NEXT: v_readfirstlane_b32 s9, v2 -; GFX10-NEXT: v_readfirstlane_b32 s10, v12 +; GFX10-NEXT: v_readfirstlane_b32 s8, v9 +; GFX10-NEXT: v_readfirstlane_b32 s9, v12 +; GFX10-NEXT: v_readfirstlane_b32 s10, v11 ; GFX10-NEXT: v_readfirstlane_b32 s11, v13 ; GFX10-NEXT: v_readfirstlane_b32 s12, v0 ; GFX10-NEXT: v_readfirstlane_b32 s13, v1 @@ -6207,34 +6405,36 @@ ; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 ; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 1, v2 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 -; GFX11-NEXT: v_readfirstlane_b32 s2, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v2, v12, v1 -; GFX11-NEXT: v_readfirstlane_b32 s3, v6 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v15, v0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v16, v1, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v0, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v7, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v9, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v11, v0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v13, v0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v15, v0, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v1, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v14, v1, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v16, v1, s4 ; GFX11-NEXT: v_readfirstlane_b32 s0, v3 ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v6 +; GFX11-NEXT: v_readfirstlane_b32 s4, v5 ; GFX11-NEXT: v_readfirstlane_b32 s5, v8 -; GFX11-NEXT: v_readfirstlane_b32 s6, v9 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 ; GFX11-NEXT: v_readfirstlane_b32 s7, v10 -; GFX11-NEXT: v_readfirstlane_b32 s8, v11 -; GFX11-NEXT: v_readfirstlane_b32 s9, v2 -; GFX11-NEXT: v_readfirstlane_b32 s10, v12 +; GFX11-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-NEXT: v_readfirstlane_b32 s9, v12 +; GFX11-NEXT: v_readfirstlane_b32 s10, v11 ; GFX11-NEXT: v_readfirstlane_b32 s11, v13 ; GFX11-NEXT: v_readfirstlane_b32 s12, v0 ; GFX11-NEXT: v_readfirstlane_b32 s13, v1 @@ -6377,35 +6577,39 @@ ; GFX11-LABEL: dyn_insertelement_v7f64_v_v_v: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v16 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v16 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v16 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 2, v16 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 3, v16 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 4, v16 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 5, v16 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 6, v16 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v14, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v15, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v14, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v15, s1 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v14 :: v_dual_cndmask_b32 v3, v3, v15 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v14, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v14, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v14, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v15, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v15, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v15, s5 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v14 :: v_dual_cndmask_b32 v5, v5, v15 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 -; GFX11-NEXT: v_readfirstlane_b32 s10, v10 -; GFX11-NEXT: v_readfirstlane_b32 s11, v11 ; GFX11-NEXT: v_readfirstlane_b32 s4, v4 ; GFX11-NEXT: v_readfirstlane_b32 s5, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v14 :: v_dual_cndmask_b32 v7, v7, v15 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 -; GFX11-NEXT: v_readfirstlane_b32 s12, v12 -; GFX11-NEXT: v_readfirstlane_b32 s13, v13 ; GFX11-NEXT: v_readfirstlane_b32 s6, v6 ; GFX11-NEXT: v_readfirstlane_b32 s7, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v14 :: v_dual_cndmask_b32 v9, v9, v15 ; GFX11-NEXT: v_readfirstlane_b32 s8, v8 ; GFX11-NEXT: v_readfirstlane_b32 s9, v9 +; GFX11-NEXT: v_readfirstlane_b32 s10, v10 +; GFX11-NEXT: v_readfirstlane_b32 s11, v11 +; GFX11-NEXT: v_readfirstlane_b32 s12, v12 +; GFX11-NEXT: v_readfirstlane_b32 s13, v13 ; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <7 x double> %vec, double %val, i32 %idx @@ -6517,24 +6721,24 @@ ; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v0, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s12, 3 -; GFX10-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v10, v0, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v1, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s12, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s12, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s12, 4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s12, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v10, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s0, v2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3 +; GFX10-NEXT: v_readfirstlane_b32 s2, v4 ; GFX10-NEXT: v_readfirstlane_b32 s3, v5 ; GFX10-NEXT: v_readfirstlane_b32 s4, v6 ; GFX10-NEXT: v_readfirstlane_b32 s5, v7 @@ -6662,28 +6866,28 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 4, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 -; GFX10-NEXT: v_readfirstlane_b32 s2, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v8, v9, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v0, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v1, s1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v0, s3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v7, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v2 +; GFX10-NEXT: v_readfirstlane_b32 s2, v3 ; GFX10-NEXT: v_readfirstlane_b32 s1, v4 ; GFX10-NEXT: v_readfirstlane_b32 s3, v6 -; GFX10-NEXT: v_readfirstlane_b32 s4, v7 -; GFX10-NEXT: v_readfirstlane_b32 s5, v2 -; GFX10-NEXT: v_readfirstlane_b32 s6, v8 +; GFX10-NEXT: v_readfirstlane_b32 s4, v5 +; GFX10-NEXT: v_readfirstlane_b32 s5, v8 +; GFX10-NEXT: v_readfirstlane_b32 s6, v7 ; GFX10-NEXT: v_readfirstlane_b32 s7, v9 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -6768,55 +6972,57 @@ ; GFX10-LABEL: dyn_insertelement_v5f64_v_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s2, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s2, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s2, 4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v11, s2 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 -; GFX10-NEXT: v_readfirstlane_b32 s8, v8 -; GFX10-NEXT: v_readfirstlane_b32 s9, v9 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s6, v6 ; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: v_readfirstlane_b32 s8, v8 +; GFX10-NEXT: v_readfirstlane_b32 s9, v9 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: dyn_insertelement_v5f64_v_v_s: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s2, 2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s2, 4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s2, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s2, 2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, s2, 3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, s2, 4 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s1 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v10, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v10, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v11, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v11, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s2 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11 ; GFX11-NEXT: v_readfirstlane_b32 s4, v4 ; GFX11-NEXT: v_readfirstlane_b32 s5, v5 -; GFX11-NEXT: v_readfirstlane_b32 s8, v8 ; GFX11-NEXT: v_readfirstlane_b32 s6, v6 ; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: v_readfirstlane_b32 s8, v8 ; GFX11-NEXT: v_readfirstlane_b32 s9, v9 ; GFX11-NEXT: ; return to shader part epilog entry: @@ -6857,55 +7063,57 @@ ; GFX10-LABEL: dyn_insertelement_v5f64_v_v_v: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 4, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 2, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 3, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 4, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v11, s3 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 -; GFX10-NEXT: v_readfirstlane_b32 s8, v8 -; GFX10-NEXT: v_readfirstlane_b32 s9, v9 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s6, v6 ; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: v_readfirstlane_b32 s8, v8 +; GFX10-NEXT: v_readfirstlane_b32 s9, v9 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: dyn_insertelement_v5f64_v_v_v: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 4, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 2, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 3, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 4, v12 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s1 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v10, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v10, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v11, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s3 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11 ; GFX11-NEXT: v_readfirstlane_b32 s4, v4 ; GFX11-NEXT: v_readfirstlane_b32 s5, v5 -; GFX11-NEXT: v_readfirstlane_b32 s8, v8 ; GFX11-NEXT: v_readfirstlane_b32 s6, v6 ; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: v_readfirstlane_b32 s8, v8 ; GFX11-NEXT: v_readfirstlane_b32 s9, v9 ; GFX11-NEXT: ; return to shader part epilog entry: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -2204,16 +2204,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 40 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -2305,13 +2305,13 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 40 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_addr64: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll @@ -349,10 +349,8 @@ ; ; GFX10-LABEL: gather4_c_b_cl_o_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s1, exec_lo +; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -364,6 +362,8 @@ ; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_b_cl_o v[0:3], v[0:5], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -48,19 +48,33 @@ } define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { -; GFX10-LABEL: image_bvh_intersect_ray_a16: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff, v7 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX10-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX10-NEXT: v_and_or_b32 v5, v5, 0xffff, v9 -; GFX10-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 -; GFX10-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX1030-LABEL: image_bvh_intersect_ray_a16: +; GFX1030: ; %bb.0: +; GFX1030-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX1030-NEXT: v_and_b32_e32 v10, 0xffff, v7 +; GFX1030-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX1030-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX1030-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX1030-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; GFX1030-NEXT: v_and_or_b32 v5, v5, 0xffff, v9 +; GFX1030-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 +; GFX1030-NEXT: s_waitcnt vmcnt(0) +; GFX1030-NEXT: ; return to shader part epilog +; +; GFX1013-LABEL: image_bvh_intersect_ray_a16: +; GFX1013: ; %bb.0: +; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX1013-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX1013-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX1013-NEXT: v_and_or_b32 v5, v5, 0xffff, v9 +; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; GFX1013-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v9 +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 +; GFX1013-NEXT: s_waitcnt vmcnt(0) +; GFX1013-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: image_bvh_intersect_ray_a16: ; GFX11: ; %bb.0: @@ -108,19 +122,33 @@ } define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { -; GFX10-LABEL: image_bvh64_intersect_ray_a16: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff, v8 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX10-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GFX10-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 -; GFX10-NEXT: v_and_or_b32 v7, v7, 0xffff, v11 -; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX1030-LABEL: image_bvh64_intersect_ray_a16: +; GFX1030: ; %bb.0: +; GFX1030-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GFX1030-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GFX1030-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX1030-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX1030-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX1030-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; GFX1030-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 +; GFX1030-NEXT: v_and_or_b32 v7, v7, 0xffff, v11 +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 +; GFX1030-NEXT: s_waitcnt vmcnt(0) +; GFX1030-NEXT: ; return to shader part epilog +; +; GFX1013-LABEL: image_bvh64_intersect_ray_a16: +; GFX1013: ; %bb.0: +; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 +; GFX1013-NEXT: v_and_b32_e32 v10, 0xffff, v8 +; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX1013-NEXT: v_and_or_b32 v7, v7, 0xffff, v10 +; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 +; GFX1013-NEXT: s_waitcnt vmcnt(0) +; GFX1013-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16: ; GFX11: ; %bb.0: @@ -194,8 +222,8 @@ ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh_intersect_ray v[15:18], v[0:10], s[4:7] ; GFX1013-NEXT: ; implicit-def: $vgpr11 -; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 +; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB6_1 @@ -248,19 +276,19 @@ ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: v_mov_b32_e32 v13, v0 -; GFX1030-NEXT: v_mov_b32_e32 v14, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; GFX1030-NEXT: v_mov_b32_e32 v14, v1 ; GFX1030-NEXT: v_mov_b32_e32 v15, v2 -; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v8 ; GFX1030-NEXT: v_mov_b32_e32 v16, v3 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX1030-NEXT: v_mov_b32_e32 v17, v4 -; GFX1030-NEXT: v_alignbit_b32 v20, v2, v7, 16 +; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: v_and_or_b32 v18, v5, 0xffff, v0 -; GFX1030-NEXT: v_and_or_b32 v19, v6, 0xffff, v1 +; GFX1030-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1030-NEXT: v_and_or_b32 v19, v6, 0xffff, v0 +; GFX1030-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX1030-NEXT: v_alignbit_b32 v20, v0, v7, 16 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v10 @@ -291,14 +319,14 @@ ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; GFX1013-NEXT: v_and_b32_e32 v14, 0xffff, v7 ; GFX1013-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; GFX1013-NEXT: v_and_or_b32 v5, v5, 0xffff, v13 -; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v14 +; GFX1013-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v13 ; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v10 @@ -310,8 +338,8 @@ ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh_intersect_ray v[13:16], v[0:7], s[4:7] a16 ; GFX1013-NEXT: ; implicit-def: $vgpr9 -; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 +; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB7_1 @@ -423,8 +451,8 @@ ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh64_intersect_ray v[16:19], v[0:11], s[4:7] ; GFX1013-NEXT: ; implicit-def: $vgpr12 -; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 +; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB8_1 @@ -477,20 +505,20 @@ ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: v_mov_b32_e32 v14, v0 -; GFX1030-NEXT: v_mov_b32_e32 v15, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX1030-NEXT: v_mov_b32_e32 v15, v1 ; GFX1030-NEXT: v_mov_b32_e32 v16, v2 -; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; GFX1030-NEXT: v_mov_b32_e32 v17, v3 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX1030-NEXT: v_mov_b32_e32 v18, v4 +; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_mov_b32_e32 v19, v5 -; GFX1030-NEXT: v_alignbit_b32 v22, v2, v8, 16 -; GFX1030-NEXT: v_and_or_b32 v20, v6, 0xffff, v0 -; GFX1030-NEXT: v_and_or_b32 v21, v7, 0xffff, v1 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo +; GFX1030-NEXT: v_and_or_b32 v20, v6, 0xffff, v0 +; GFX1030-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1030-NEXT: v_and_or_b32 v21, v7, 0xffff, v0 +; GFX1030-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX1030-NEXT: v_alignbit_b32 v22, v0, v8, 16 ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v11 @@ -522,14 +550,14 @@ ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; GFX1013-NEXT: v_and_b32_e32 v15, 0xffff, v8 ; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 ; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v14 -; GFX1013-NEXT: v_and_or_b32 v7, v7, 0xffff, v15 +; GFX1013-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX1013-NEXT: v_and_or_b32 v7, v7, 0xffff, v14 ; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v10 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v11 @@ -541,8 +569,8 @@ ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh64_intersect_ray v[14:17], v[0:8], s[4:7] a16 ; GFX1013-NEXT: ; implicit-def: $vgpr10 -; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB9_1 @@ -627,7 +655,8 @@ ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; GFX1013-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 @@ -635,15 +664,14 @@ ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, s0 ; GFX1013-NEXT: v_mov_b32_e32 v1, s1 -; GFX1013-NEXT: v_mov_b32_e32 v2, s2 -; GFX1013-NEXT: v_mov_b32_e32 v3, s3 -; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 +; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v0, v4 +; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX1013-NEXT: v_mov_b32_e32 v0, s2 +; GFX1013-NEXT: v_mov_b32_e32 v1, s3 +; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 -; GFX1013-NEXT: flat_load_dword v0, v[4:5] -; GFX1013-NEXT: flat_load_dword v1, v[2:3] +; GFX1013-NEXT: flat_load_dword v0, v[2:3] +; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 @@ -757,31 +785,31 @@ ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; GFX1013-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1013-NEXT: s_movk_i32 s9, 0x4600 ; GFX1013-NEXT: s_movk_i32 s8, 0x4700 ; GFX1013-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, s0 ; GFX1013-NEXT: v_mov_b32_e32 v1, s1 -; GFX1013-NEXT: v_mov_b32_e32 v2, s2 -; GFX1013-NEXT: v_mov_b32_e32 v3, s3 ; GFX1013-NEXT: s_movk_i32 s1, 0x4400 -; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX1013-NEXT: s_movk_i32 s2, 0x4200 -; GFX1013-NEXT: flat_load_dword v0, v[4:5] -; GFX1013-NEXT: flat_load_dword v1, v[2:3] +; GFX1013-NEXT: s_movk_i32 s0, 0x4500 ; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v0, v4 +; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX1013-NEXT: v_mov_b32_e32 v0, s2 +; GFX1013-NEXT: v_mov_b32_e32 v1, s3 +; GFX1013-NEXT: s_movk_i32 s2, 0x4200 ; GFX1013-NEXT: s_movk_i32 s3, 0x4800 ; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 +; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX1013-NEXT: flat_load_dword v0, v[2:3] +; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: s_lshl_b32 s1, s1, 16 -; GFX1013-NEXT: s_movk_i32 s0, 0x4500 +; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX1013-NEXT: s_or_b32 s1, s2, s1 ; GFX1013-NEXT: s_bfe_u32 s2, s9, 0x100000 -; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX1013-NEXT: s_lshl_b32 s2, s2, 16 ; GFX1013-NEXT: s_lshl_b32 s3, s3, 16 @@ -1007,44 +1035,43 @@ ; ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1013-NEXT: s_movk_i32 s1, 0x4400 -; GFX1013-NEXT: s_movk_i32 s9, 0x4600 -; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX1013-NEXT: s_movk_i32 s0, 0x4500 -; GFX1013-NEXT: s_lshl_b32 s1, s1, 16 -; GFX1013-NEXT: s_movk_i32 s8, 0x4700 -; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1013-NEXT: s_movk_i32 s7, 0x4400 +; GFX1013-NEXT: s_movk_i32 s8, 0x4200 +; GFX1013-NEXT: s_movk_i32 s9, 0x4800 +; GFX1013-NEXT: s_movk_i32 s11, 0x4600 +; GFX1013-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX1013-NEXT: s_movk_i32 s6, 0x4500 +; GFX1013-NEXT: s_movk_i32 s10, 0x4700 ; GFX1013-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX1013-NEXT: s_bfe_u32 s11, s11, 0x100000 +; GFX1013-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s2 -; GFX1013-NEXT: v_mov_b32_e32 v1, s3 -; GFX1013-NEXT: s_movk_i32 s2, 0x4200 -; GFX1013-NEXT: s_movk_i32 s3, 0x4800 -; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX1013-NEXT: v_mov_b32_e32 v0, s4 +; GFX1013-NEXT: v_mov_b32_e32 v1, s5 +; GFX1013-NEXT: s_lshl_b32 s4, s7, 16 +; GFX1013-NEXT: s_bfe_u32 s7, s9, 0x100000 +; GFX1013-NEXT: s_lshl_b32 s5, s11, 16 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX1013-NEXT: s_or_b32 s1, s2, s1 -; GFX1013-NEXT: s_bfe_u32 s2, s9, 0x100000 -; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX1013-NEXT: s_or_b32 s4, s8, s4 +; GFX1013-NEXT: s_bfe_u32 s8, s10, 0x100000 +; GFX1013-NEXT: s_lshl_b32 s7, s7, 16 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] -; GFX1013-NEXT: s_lshl_b32 s2, s2, 16 -; GFX1013-NEXT: s_lshl_b32 s3, s3, 16 -; GFX1013-NEXT: s_or_b32 s0, s0, s2 -; GFX1013-NEXT: s_or_b32 s2, s8, s3 +; GFX1013-NEXT: s_or_b32 s5, s6, s5 +; GFX1013-NEXT: s_or_b32 s6, s8, s7 ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 -; GFX1013-NEXT: v_mov_b32_e32 v6, s1 -; GFX1013-NEXT: v_mov_b32_e32 v7, s0 -; GFX1013-NEXT: v_mov_b32_e32 v8, s2 +; GFX1013-NEXT: v_mov_b32_e32 v6, s4 +; GFX1013-NEXT: v_mov_b32_e32 v7, s5 +; GFX1013-NEXT: v_mov_b32_e32 v8, s6 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16 +; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm @@ -1097,3 +1124,5 @@ store <4 x i32> %v, ptr undef ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10: {{.*}} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -70,16 +70,16 @@ ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v3, v4, 0xff, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 -; GFX10-NEXT: v_or3_b32 v1, v3, v4, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX10-NEXT: v_and_or_b32 v1, v4, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_or3_b32 v1, v1, v2, v3 ; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %a.cast = bitcast <4 x i8> %a to i32 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll @@ -71,16 +71,16 @@ ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v3, v4, 0xff, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 -; GFX10-NEXT: v_or3_b32 v1, v3, v4, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX10-NEXT: v_and_or_b32 v1, v4, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_or3_b32 v1, v1, v2, v3 ; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll @@ -151,48 +151,47 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_read_u8 v1, v0 ; GFX10-NEXT: ds_read_u8 v2, v0 offset:1 -; GFX10-NEXT: ds_read_u8 v3, v0 offset:2 -; GFX10-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:2 ; GFX10-NEXT: ds_read_u8 v5, v0 offset:4 ; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 -; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 -; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 -; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 -; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 -; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 -; GFX10-NEXT: ds_read_u8 v12, v0 offset:11 -; GFX10-NEXT: ds_read_u8 v13, v0 offset:12 -; GFX10-NEXT: ds_read_u8 v14, v0 offset:13 -; GFX10-NEXT: ds_read_u8 v15, v0 offset:15 -; GFX10-NEXT: ds_read_u8 v0, v0 offset:14 -; GFX10-NEXT: s_waitcnt lgkmcnt(14) -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(13) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(12) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v4 -; GFX10-NEXT: s_waitcnt lgkmcnt(10) -; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; GFX10-NEXT: s_waitcnt lgkmcnt(9) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX10-NEXT: s_waitcnt lgkmcnt(8) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v8, v0 offset:6 ; GFX10-NEXT: s_waitcnt lgkmcnt(6) -; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(5) -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_or3_b32 v4, v2, v3, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_lshl_or_b32 v10, v14, 8, v13 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 8, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; GFX10-NEXT: v_or3_b32 v1, v2, v3, v1 +; GFX10-NEXT: ds_read_u8 v2, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:12 +; GFX10-NEXT: ds_read_u8 v8, v0 offset:13 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:15 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:14 +; GFX10-NEXT: s_waitcnt lgkmcnt(6) +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 8, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(5) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v0 -; GFX10-NEXT: v_or3_b32 v0, v2, v3, v1 -; GFX10-NEXT: v_or3_b32 v1, v5, v6, v4 -; GFX10-NEXT: v_or3_b32 v2, v8, v9, v7 -; GFX10-NEXT: v_or3_b32 v3, v11, v12, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_or3_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 8, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; GFX10-NEXT: v_or3_b32 v3, v5, v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: load_lds_v4i32_align1: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll @@ -130,37 +130,36 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_read_u8 v1, v0 ; GFX10-NEXT: ds_read_u8 v2, v0 offset:1 -; GFX10-NEXT: ds_read_u8 v3, v0 offset:2 -; GFX10-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:2 ; GFX10-NEXT: ds_read_u8 v5, v0 offset:4 ; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 -; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 -; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 -; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 -; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 -; GFX10-NEXT: ds_read_u8 v11, v0 offset:11 -; GFX10-NEXT: ds_read_u8 v0, v0 offset:10 -; GFX10-NEXT: s_waitcnt lgkmcnt(10) -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(9) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(8) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v8, v0 offset:6 ; GFX10-NEXT: s_waitcnt lgkmcnt(6) -; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(5) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX10-NEXT: v_or3_b32 v3, v2, v3, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 8, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v7 +; GFX10-NEXT: v_or3_b32 v1, v2, v4, v1 +; GFX10-NEXT: ds_read_u8 v2, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:10 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9 +; GFX10-NEXT: v_lshl_or_b32 v2, v4, 8, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX10-NEXT: v_or3_b32 v0, v2, v3, v1 -; GFX10-NEXT: v_or3_b32 v1, v5, v6, v4 -; GFX10-NEXT: v_or3_b32 v2, v8, v9, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_or3_b32 v2, v4, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: load_lds_v3i32_align1: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -482,8 +482,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v16, v0 +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, v17, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v18, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v19, v3 @@ -499,15 +499,15 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v13, v29, v13 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, v30, v14 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v15, v31, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v15, v16, v15 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_lshr_v16i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v16, v0 +; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, v17, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v18, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v19, v3 @@ -523,7 +523,7 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v13, v29, v13 ; GFX11-NEXT: v_lshrrev_b32_e32 v14, v30, v14 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v15, v31, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, v16, v15 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = lshr <16 x i32> %value, %amount ret <16 x i32> %result @@ -1668,23 +1668,23 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v3 -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v3, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v3 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v2, v[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v3, v[4:5] -; GFX10-NEXT: v_or_b32_e32 v2, v6, v8 -; GFX10-NEXT: v_or_b32_e32 v6, v7, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b64 v[6:7], v2, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v2, v8, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 64, v3 +; GFX10-NEXT: v_or_b32_e32 v8, v9, v7 +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v3, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v1, v6, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_lshr_i65: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -8,17 +8,17 @@ ; GFX10-LABEL: v_mul_i64_zext_00: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, v1 -; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[2:3] +; GFX10-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX10-NEXT: v_mul_lo_u32 v5, v1, v2 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, v2, 0 +; GFX10-NEXT: v_add3_u32 v1, v1, v3, v5 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_00: @@ -62,11 +62,11 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: global_load_dword v4, v3, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v0, v4, 0 -; GFX10-NEXT: v_mul_lo_u32 v0, v1, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5] +; GFX10-NEXT: v_mul_lo_u32 v2, v1, v4 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, v4, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_01: @@ -111,11 +111,11 @@ ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 -; GFX10-NEXT: v_mul_lo_u32 v0, v4, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5] +; GFX10-NEXT: v_mul_lo_u32 v2, v4, v1 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v4, v0, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_10: @@ -203,14 +203,14 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v4, v2, s[6:7] +; GFX10-NEXT: global_load_dword v3, v2, s[6:7] ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 -; GFX10-NEXT: v_mul_lo_u32 v0, v4, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5] +; GFX10-NEXT: v_mul_lo_u32 v2, v3, v1 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v3, v0, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_and_a_hi: @@ -247,21 +247,20 @@ ; GFX10-LABEL: v_mul_i64_and_a_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, 0, v0, 0 -; GFX10-NEXT: v_mul_lo_u32 v1, 0, v1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mul_lo_u32 v0, v3, v0 -; GFX10-NEXT: v_add3_u32 v5, v5, v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[4:5] +; GFX10-NEXT: v_mul_lo_u32 v4, v1, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, 0, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, 0, v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_add3_u32 v1, v1, v3, v4 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_and_a_lo: @@ -308,14 +307,13 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, 0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX10-NEXT: v_mul_lo_u32 v1, v1, 0 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[4:5] +; GFX10-NEXT: v_mul_lo_u32 v2, v0, v3 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, 0, 0 +; GFX10-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_and_b_lo: @@ -395,14 +393,14 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v4, 0xfff00000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xf00f, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xf00f, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xfff00000, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v4, v2, 0 -; GFX10-NEXT: v_mul_lo_u32 v3, v4, v3 -; GFX10-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX10-NEXT: v_add3_u32 v1, v1, v3, v2 +; GFX10-NEXT: v_mul_lo_u32 v4, v1, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_add3_u32 v1, v1, v3, v4 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -503,21 +501,21 @@ ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mul_lo_u32 v5, v2, v5 +; GFX10-NEXT: v_mul_lo_u32 v3, v2, v5 ; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX10-NEXT: ; %bb.1: ; %else ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, v2, v4, 0 ; GFX10-NEXT: v_mul_lo_u32 v2, 0, v4 -; GFX10-NEXT: v_add3_u32 v1, v1, v5, v2 +; GFX10-NEXT: v_add3_u32 v1, v1, v3, v2 ; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX10-NEXT: ; implicit-def: $vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr3 ; GFX10-NEXT: ; %bb.2: ; %Flow ; GFX10-NEXT: s_andn2_saveexec_b32 s0, s0 ; GFX10-NEXT: ; %bb.3: ; %if ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, v2, 0, 0 ; GFX10-NEXT: v_mul_lo_u32 v2, 0, 0 -; GFX10-NEXT: v_add3_u32 v1, v1, v5, v2 +; GFX10-NEXT: v_add3_u32 v1, v1, v3, v2 ; GFX10-NEXT: ; %bb.4: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -383,12 +383,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0 -; GFX10-NEXT: v_mul_lo_u32 v3, v4, v3 -; GFX10-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX10-NEXT: v_add3_u32 v1, v1, v3, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, v1, v2 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v0, v2, 0 +; GFX10-NEXT: v_add3_u32 v1, v1, v3, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i64: @@ -523,11 +521,11 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3 -; GFX10-NEXT: v_mul_lo_u32 v5, v6, v5 -; GFX10-NEXT: v_mul_lo_u32 v8, v7, v4 +; GFX10-NEXT: v_mul_lo_u32 v1, v2, v3 +; GFX10-NEXT: v_mul_lo_u32 v0, v6, v5 +; GFX10-NEXT: v_mul_lo_u32 v2, v7, v4 +; GFX10-NEXT: v_add3_u32 v2, v0, v2, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 -; GFX10-NEXT: v_add3_u32 v2, v5, v8, v2 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2] ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2] ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -798,9 +796,9 @@ ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v6, 0 ; GFX10-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v9, v5, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12] ; GFX10-NEXT: v_mov_b32_e32 v2, v11 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] ; GFX10-NEXT: v_mul_lo_u32 v5, v10, v5 @@ -1855,69 +1853,69 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 -; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX10-NEXT: v_mul_lo_u32 v25, v6, v9 +; GFX10-NEXT: v_mul_lo_u32 v26, v5, v10 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 ; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0 -; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19] ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v20, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21] -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_mov_b32_e32 v20, v22 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20] -; GFX10-NEXT: v_mov_b32_e32 v20, v18 -; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25] -; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v14, v21 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v6, v8, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, v19 +; GFX10-NEXT: v_mov_b32_e32 v23, v18 +; GFX10-NEXT: v_mul_lo_u32 v18, v4, v11 +; GFX10-NEXT: v_mov_b32_e32 v1, v20 +; GFX10-NEXT: v_mad_u64_u32 v[19:20], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[19:20], s4, v17, v9, v[19:20] +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v17, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[19:20], s4, v2, v8, v[19:20] +; GFX10-NEXT: v_add_co_ci_u32_e64 v27, s4, 0, v6, s4 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v11, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-NEXT: v_mul_lo_u32 v20, v3, v12 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, v3, v10, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[22:23], s7, v16, v11, v[22:23] +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s7 +; GFX10-NEXT: v_mad_u64_u32 v[10:11], s7, v17, v10, v[22:23] +; GFX10-NEXT: v_mul_lo_u32 v22, v2, v13 +; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v4, v9, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s9, v16, v8, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s7, 0, v6, s7 +; GFX10-NEXT: v_mad_u64_u32 v[10:11], s7, v2, v9, v[10:11] +; GFX10-NEXT: v_mov_b32_e32 v2, v19 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s9, v5, v8, v[12:13] +; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s7, 0, v4, s7 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, v16, v9, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s7, v3, v8, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s10 +; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s7, 0, v12, s7 +; GFX10-NEXT: v_mul_lo_u32 v9, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s7, v17, v8, v[1:2] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, v11, v3, s7 +; GFX10-NEXT: v_mul_lo_u32 v11, v16, v15 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s7, v27, v4, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s7, v10, v5, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s7, v24, v6, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s7, v21, v11, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s7, v10, v9, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s7, v9, v22, s8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v20, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v18, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v26, s5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v25, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v7, v8, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1926,69 +1924,68 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 +; GFX11-NEXT: v_mul_lo_u32 v25, v6, v9 +; GFX11-NEXT: v_mul_lo_u32 v26, v5, v10 ; GFX11-NEXT: v_mul_lo_u32 v7, v7, v8 -; GFX11-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 ; GFX11-NEXT: v_mad_u64_u32 v[18:19], null, v16, v12, 0 -; GFX11-NEXT: v_mul_lo_u32 v30, v17, v14 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[18:19], s0, v17, v11, v[18:19] ; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v20, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX11-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX11-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v8, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX11-NEXT: v_mov_b32_e32 v20, v22 -; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[19:20] -; GFX11-NEXT: v_mov_b32_e32 v20, v18 -; GFX11-NEXT: v_mov_b32_e32 v19, v22 -; GFX11-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[19:20] +; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v6, v8, v[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, v19 :: v_dual_mov_b32 v1, v20 +; GFX11-NEXT: v_mad_u64_u32 v[19:20], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[19:20], s0, v17, v9, v[19:20] +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s1, v17, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[19:20], s0, v2, v8, v[19:20] +; GFX11-NEXT: v_mov_b32_e32 v23, v18 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, s0, 0, v6, s0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v2, v11, v[0:1] +; GFX11-NEXT: v_mul_lo_u32 v18, v4, v11 +; GFX11-NEXT: v_mov_b32_e32 v22, v20 +; GFX11-NEXT: v_mul_lo_u32 v20, v3, v12 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], s3, v16, v11, v[22:23] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s2, v3, v10, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3 +; GFX11-NEXT: v_mad_u64_u32 v[10:11], s3, v17, v10, v[22:23] +; GFX11-NEXT: v_mul_lo_u32 v22, v2, v13 +; GFX11-NEXT: v_mad_u64_u32 v[12:13], s4, v4, v9, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v8, 0 -; GFX11-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[18:19], s1, v2, v11, v[24:25] -; GFX11-NEXT: v_mul_lo_u32 v25, v3, v12 -; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX11-NEXT: v_mov_b32_e32 v14, v21 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 -; GFX11-NEXT: v_mad_u64_u32 v[18:19], s3, v3, v10, v[18:19] -; GFX11-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX11-NEXT: v_mov_b32_e32 v13, v1 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[11:12] -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 -; GFX11-NEXT: v_mad_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX11-NEXT: v_mad_u64_u32 v[12:13], s4, v16, v9, v[13:14] -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v8, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4 -; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v8, v[10:11] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v8, v[12:13] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1 -; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s3, 0, v6, s3 +; GFX11-NEXT: v_mad_u64_u32 v[10:11], s3, v2, v9, v[10:11] +; GFX11-NEXT: v_mov_b32_e32 v2, v19 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], s5, v5, v8, v[12:13] +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, s3, 0, v4, s3 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s6, v16, v9, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[3:4], s3, v3, v8, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s6 +; GFX11-NEXT: v_add_co_ci_u32_e64 v10, s3, 0, v12, s3 +; GFX11-NEXT: v_mul_lo_u32 v9, v17, v14 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s3, v17, v8, v[1:2] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s3, v11, v3, s3 +; GFX11-NEXT: v_mul_lo_u32 v11, v16, v15 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s3, v27, v4, s3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s3, v10, v5, s3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s3, v24, v6, s3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v10, s3, v21, v11, s3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s3, v10, v9, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s3, v9, v22, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v20, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v18, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v26, s1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v25, vcc_lo ; GFX11-NEXT: v_add_nc_u32_e32 v7, v8, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = mul i256 %num, %den Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -662,28 +662,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 ; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100 -; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_perm_b32 v3, v3, v1, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v4, v1, 16 ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v2, v2, v3 clamp +; GFX10-NEXT: v_mov_b32_e32 v3, 8 ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp -; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_pk_ashrrev_i16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 24 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_saddsat_v4i8: @@ -1952,19 +1952,33 @@ ; GFX9-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_saddsat_v5i32: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s5 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v1, s1, s6 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v2, s2, s7 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v3, s3, s8 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v4, s4, s9 clamp -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_saddsat_v5i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_i32 v0, s0, s5 clamp +; GFX10-NEXT: v_add_nc_i32 v1, s1, s6 clamp +; GFX10-NEXT: v_add_nc_i32 v2, s2, s7 clamp +; GFX10-NEXT: v_add_nc_i32 v3, s3, s8 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_add_nc_i32 v0, s4, s9 clamp +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_saddsat_v5i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_nc_i32 v0, s0, s5 clamp +; GFX11-NEXT: v_add_nc_i32 v1, s1, s6 clamp +; GFX11-NEXT: v_add_nc_i32 v2, s2, s7 clamp +; GFX11-NEXT: v_add_nc_i32 v3, s3, s8 clamp +; GFX11-NEXT: v_add_nc_i32 v4, s4, s9 clamp +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: ; return to shader part epilog %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) ret <5 x i32> %result } @@ -2243,8 +2257,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_add_nc_i32 v0, v0, v16 clamp +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX10-NEXT: v_add_nc_i32 v1, v1, v17 clamp ; GFX10-NEXT: v_add_nc_i32 v2, v2, v18 clamp ; GFX10-NEXT: v_add_nc_i32 v3, v3, v19 clamp @@ -2260,15 +2274,15 @@ ; GFX10-NEXT: v_add_nc_i32 v13, v13, v29 clamp ; GFX10-NEXT: v_add_nc_i32 v14, v14, v30 clamp ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_i32 v15, v15, v31 clamp +; GFX10-NEXT: v_add_nc_i32 v15, v15, v16 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_saddsat_v16i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: v_add_nc_i32 v0, v0, v16 clamp +; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: v_add_nc_i32 v1, v1, v17 clamp ; GFX11-NEXT: v_add_nc_i32 v2, v2, v18 clamp ; GFX11-NEXT: v_add_nc_i32 v3, v3, v19 clamp @@ -2284,7 +2298,7 @@ ; GFX11-NEXT: v_add_nc_i32 v13, v13, v29 clamp ; GFX11-NEXT: v_add_nc_i32 v14, v14, v30 clamp ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_nc_i32 v15, v15, v31 clamp +; GFX11-NEXT: v_add_nc_i32 v15, v15, v16 clamp ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -2575,41 +2589,77 @@ ; GFX9-NEXT: v_readfirstlane_b32 s15, v15 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_saddsat_v16i32: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s16 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v1, s1, s17 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v2, s2, s18 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v3, s3, s19 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v4, s4, s20 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v5, s5, s21 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v6, s6, s22 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v7, s7, s23 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v8, s8, s24 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v9, s9, s25 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v10, s10, s26 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v11, s11, s27 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v12, s12, s28 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v13, s13, s29 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v14, s14, s30 clamp -; GFX10PLUS-NEXT: v_add_nc_i32 v15, s15, s31 clamp -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s8, v8 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s9, v9 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s10, v10 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s11, v11 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s12, v12 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s13, v13 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s14, v14 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s15, v15 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_saddsat_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_i32 v0, s0, s16 clamp +; GFX10-NEXT: v_add_nc_i32 v1, s1, s17 clamp +; GFX10-NEXT: v_add_nc_i32 v2, s2, s18 clamp +; GFX10-NEXT: v_add_nc_i32 v3, s3, s19 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_add_nc_i32 v0, s4, s20 clamp +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_add_nc_i32 v1, s5, s21 clamp +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_add_nc_i32 v2, s6, s22 clamp +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_add_nc_i32 v3, s7, s23 clamp +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_add_nc_i32 v0, s8, s24 clamp +; GFX10-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-NEXT: v_add_nc_i32 v1, s9, s25 clamp +; GFX10-NEXT: v_readfirstlane_b32 s6, v2 +; GFX10-NEXT: v_add_nc_i32 v2, s10, s26 clamp +; GFX10-NEXT: v_readfirstlane_b32 s7, v3 +; GFX10-NEXT: v_add_nc_i32 v3, s11, s27 clamp +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_add_nc_i32 v0, s12, s28 clamp +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_add_nc_i32 v1, s13, s29 clamp +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_add_nc_i32 v2, s14, s30 clamp +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_add_nc_i32 v3, s15, s31 clamp +; GFX10-NEXT: v_readfirstlane_b32 s12, v0 +; GFX10-NEXT: v_readfirstlane_b32 s13, v1 +; GFX10-NEXT: v_readfirstlane_b32 s14, v2 +; GFX10-NEXT: v_readfirstlane_b32 s15, v3 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_saddsat_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_nc_i32 v0, s0, s16 clamp +; GFX11-NEXT: v_add_nc_i32 v1, s1, s17 clamp +; GFX11-NEXT: v_add_nc_i32 v2, s2, s18 clamp +; GFX11-NEXT: v_add_nc_i32 v3, s3, s19 clamp +; GFX11-NEXT: v_add_nc_i32 v4, s4, s20 clamp +; GFX11-NEXT: v_add_nc_i32 v5, s5, s21 clamp +; GFX11-NEXT: v_add_nc_i32 v6, s6, s22 clamp +; GFX11-NEXT: v_add_nc_i32 v7, s7, s23 clamp +; GFX11-NEXT: v_add_nc_i32 v8, s8, s24 clamp +; GFX11-NEXT: v_add_nc_i32 v9, s9, s25 clamp +; GFX11-NEXT: v_add_nc_i32 v10, s10, s26 clamp +; GFX11-NEXT: v_add_nc_i32 v11, s11, s27 clamp +; GFX11-NEXT: v_add_nc_i32 v12, s12, s28 clamp +; GFX11-NEXT: v_add_nc_i32 v13, s13, s29 clamp +; GFX11-NEXT: v_add_nc_i32 v14, s14, s30 clamp +; GFX11-NEXT: v_add_nc_i32 v15, s15, s31 clamp +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: v_readfirstlane_b32 s8, v8 +; GFX11-NEXT: v_readfirstlane_b32 s9, v9 +; GFX11-NEXT: v_readfirstlane_b32 s10, v10 +; GFX11-NEXT: v_readfirstlane_b32 s11, v11 +; GFX11-NEXT: v_readfirstlane_b32 s12, v12 +; GFX11-NEXT: v_readfirstlane_b32 s13, v13 +; GFX11-NEXT: v_readfirstlane_b32 s14, v14 +; GFX11-NEXT: v_readfirstlane_b32 s15, v15 +; GFX11-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result } @@ -4218,11 +4268,11 @@ ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v0 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4430,12 +4480,12 @@ ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] -; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 -; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], v[2:3] +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v0 +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4529,11 +4579,11 @@ ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v0 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4609,12 +4659,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 -; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v0 +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4782,12 +4832,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] -; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 -; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], v[2:3] +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v0 +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog ; @@ -4855,11 +4905,11 @@ ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v0 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog ; @@ -4958,22 +5008,22 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 +; GFX10-NEXT: v_add_co_u32 v4, s5, v2, v6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s5, v3, v7, s5 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[4:5], v[2:3] +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, s4, 0x80000000, v0 +; GFX10-NEXT: v_add_co_u32 v3, s4, 0x80000000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_saddsat_v2i64: @@ -5464,20 +5514,20 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5] -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] -; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo +; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], v[0:1] +; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[2:3], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: v_cmp_eq_u64_e64 s0, s[2:3], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, 0, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v2, v3, v2 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_add_co_u32 v6, s0, 0x80000000, v3 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_add_co_u32 v6, s0, 0x80000000, v3 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -5621,20 +5671,20 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, v[4:5], v[0:1] ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: s_and_b32 s0, 1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 ; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5843,48 +5893,48 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1] -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12 -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[0:1] +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[16:17], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[16:17], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v18, 0, vcc_lo +; GFX10-NEXT: v_add_co_u32 v3, s4, 0x80000000, v2 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7] ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v12 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v6, v14, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v15, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15] -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[4:5] +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v17 -; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v3, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v5, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v18, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v19, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v4, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v12, 0, vcc_lo +; GFX10-NEXT: v_add_co_u32 v7, s4, 0x80000000, v6 +; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_saddsat_v2i128: @@ -6218,6 +6268,7 @@ ; GFX10-NEXT: s_addc_u32 s16, s2, s10 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] ; GFX10-NEXT: s_addc_u32 s17, s3, s11 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -6228,69 +6279,67 @@ ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: s_ashr_i32 s2, s17, 31 ; GFX10-NEXT: s_and_b32 s1, 1, s1 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 -; GFX10-NEXT: s_ashr_i32 s0, s17, 31 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_add_u32 s1, s2, 0x80000000 ; GFX10-NEXT: s_add_u32 s10, s4, s12 ; GFX10-NEXT: s_addc_u32 s11, s5, s13 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[10:11], s[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX10-NEXT: s_addc_u32 s12, s6, s14 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[10:11], s[4:5] ; GFX10-NEXT: s_addc_u32 s13, s7, s15 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_cmp_eq_u64 s[12:13], s[6:7] -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[12:13], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[14:15], 0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_mov_b32 s3, s0 -; GFX10-NEXT: s_mov_b32 s2, s0 -; GFX10-NEXT: v_mov_b32_e32 v6, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX10-NEXT: s_and_b32 s4, 1, s8 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s3 +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[12:13], s[6:7] +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 -; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 -; GFX10-NEXT: s_and_b32 s5, 1, s5 -; GFX10-NEXT: v_mov_b32_e32 v7, s13 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s3 +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[14:15], 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_ashr_i32 s4, s13, 31 +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s3 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, 0, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v3, v4, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s10 -; GFX10-NEXT: s_ashr_i32 s0, s13, 31 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 -; GFX10-NEXT: s_mov_b32 s3, s0 -; GFX10-NEXT: s_mov_b32 s2, s0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s1, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s1, v2 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v4 -; GFX10-NEXT: v_readfirstlane_b32 s4, v5 -; GFX10-NEXT: v_readfirstlane_b32 s5, v6 -; GFX10-NEXT: v_readfirstlane_b32 s6, v3 -; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_mov_b32 s7, s4 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, s2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, s9 +; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, s16 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s1, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s1, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, s10 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, s11 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, s12 +; GFX10-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s5, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v2 +; GFX10-NEXT: v_readfirstlane_b32 s6, v1 +; GFX10-NEXT: v_readfirstlane_b32 s7, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_saddsat_v2i128: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -472,11 +472,11 @@ ; GFX10-NEXT: s_addc_u32 s9, s11, s12 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3] ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX10-NEXT: s_sub_u32 s10, 0, s8 -; GFX10-NEXT: s_subb_u32 s11, 0, s9 +; GFX10-NEXT: s_sub_u32 s0, 0, s8 +; GFX10-NEXT: s_subb_u32 s1, 0, s9 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -487,130 +487,130 @@ ; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX10-NEXT: v_mul_lo_u32 v4, s10, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s10, v3, 0 -; GFX10-NEXT: v_mul_lo_u32 v5, s11, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, v2, v0 +; GFX10-NEXT: v_mul_lo_u32 v4, s0, v2 +; GFX10-NEXT: v_mul_lo_u32 v5, s1, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s0, v3, 0 ; GFX10-NEXT: v_add3_u32 v1, v1, v4, v5 ; GFX10-NEXT: v_mul_lo_u32 v4, v2, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0 ; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX10-NEXT: v_mul_lo_u32 v7, v2, v1 -; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, v2, v1 ; GFX10-NEXT: v_add_co_u32 v4, s14, v4, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s14 -; GFX10-NEXT: v_add_co_u32 v6, s14, v7, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s14 -; GFX10-NEXT: v_add_co_u32 v0, s14, v4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s14 -; GFX10-NEXT: v_add_co_u32 v4, s14, v6, v8 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s14 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6 -; GFX10-NEXT: v_add_co_u32 v0, s14, v4, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, v2, v0 +; GFX10-NEXT: v_add_co_u32 v4, s14, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v5, v2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s14 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 +; GFX10-NEXT: v_add_co_u32 v0, s14, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s14 +; GFX10-NEXT: v_mul_hi_u32 v1, v2, v1 +; GFX10-NEXT: v_add_co_u32 v0, s14, v0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s14 +; GFX10-NEXT: v_add_co_u32 v0, s14, v0, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v6, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s14 ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v0 ; GFX10-NEXT: v_add3_u32 v1, v5, v4, v1 -; GFX10-NEXT: v_mul_lo_u32 v4, s11, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, s1, v3 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s10, v3, 0 -; GFX10-NEXT: v_mul_lo_u32 v5, s10, v2 -; GFX10-NEXT: v_mul_hi_u32 v6, v2, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s0, v2 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v3, 0 ; GFX10-NEXT: v_add3_u32 v1, v1, v5, v4 ; GFX10-NEXT: v_mul_lo_u32 v4, v2, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0 ; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX10-NEXT: v_mul_lo_u32 v7, v2, v1 -; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v0, v2, v0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v5, v2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 +; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX10-NEXT: v_add_co_u32 v4, s10, v4, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v6, s10, v7, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v0, s10, v4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v4, s10, v6, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6 -; GFX10-NEXT: v_add_co_u32 v0, s10, v4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v6, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0 ; GFX10-NEXT: v_add3_u32 v1, v5, v4, v1 -; GFX10-NEXT: v_mul_hi_u32 v4, s1, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v2, s1, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s0, v1 -; GFX10-NEXT: v_mul_lo_u32 v5, s1, v1 -; GFX10-NEXT: v_mul_hi_u32 v6, s0, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v4, s10, v5, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v2, s10, v4, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s10 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10 +; GFX10-NEXT: v_mul_lo_u32 v2, s11, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, s10, v1 +; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v3 +; GFX10-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v0, s11, v0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v3 +; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v0 +; GFX10-NEXT: v_mul_hi_u32 v3, s10, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: v_mul_lo_u32 v4, s9, v2 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, 1 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s10, s8, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s8, v2, 0 ; GFX10-NEXT: v_mul_lo_u32 v5, s8, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add3_u32 v1, v1, v5, v4 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v6, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v8, s1, v1 -; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v0, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v8, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s9, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v12, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v14, v13, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v10, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v12 -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v9 -; GFX10-NEXT: s_xor_b64 s[8:9], s[2:3], s[12:13] -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v13, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_xor_b32_e32 v2, s8, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, s9, v3 -; GFX10-NEXT: v_xor_b32_e32 v5, s2, v0 -; GFX10-NEXT: v_xor_b32_e32 v6, s2, v1 -; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s9, v3, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v5, s2 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s2, v6, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, s10, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s11, v1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s8, v4 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s11, v1 +; GFX10-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v0, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, v4, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s8, v7 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v2, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s0, 0, v3, s0 +; GFX10-NEXT: v_add_co_u32 v11, s0, v1, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s0, 0, v10, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v12, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v10, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_xor_b32_e32 v0, s10, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s11, v1 +; GFX10-NEXT: v_sub_co_u32 v0, s1, v0, s10 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s1, s11, v1, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s9, v6, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v1, vcc_lo, v7, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v0, s0 +; GFX10-NEXT: v_xor_b32_e32 v1, s2, v1 +; GFX10-NEXT: v_xor_b32_e32 v3, s2, v0 +; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v1, s2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s2, v3, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm %div = sdiv i64 %x, %y store i64 %div, ptr addrspace(1) %out0 @@ -774,72 +774,72 @@ ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s1, s10, 31 -; GFX10-NEXT: s_ashr_i32 s2, s11, 31 +; GFX10-NEXT: s_ashr_i32 s3, s11, 31 ; GFX10-NEXT: s_add_i32 s0, s10, s1 -; GFX10-NEXT: s_add_i32 s3, s11, s2 -; GFX10-NEXT: s_xor_b32 s10, s0, s1 -; GFX10-NEXT: s_xor_b32 s3, s3, s2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s0, 0, s10 -; GFX10-NEXT: s_sub_i32 s11, 0, s3 -; GFX10-NEXT: s_ashr_i32 s12, s9, 31 +; GFX10-NEXT: s_xor_b32 s2, s0, s1 +; GFX10-NEXT: s_add_i32 s0, s11, s3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_xor_b32 s10, s0, s3 +; GFX10-NEXT: s_sub_i32 s0, 0, s2 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10 +; GFX10-NEXT: s_ashr_i32 s11, s8, 31 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_xor_b32 s1, s11, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1 -; GFX10-NEXT: s_ashr_i32 s11, s8, 31 +; GFX10-NEXT: s_sub_i32 s0, 0, s10 +; GFX10-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX10-NEXT: s_add_i32 s0, s8, s11 -; GFX10-NEXT: s_add_i32 s8, s9, s12 +; GFX10-NEXT: s_ashr_i32 s8, s9, 31 ; GFX10-NEXT: s_xor_b32 s0, s0, s11 -; GFX10-NEXT: s_xor_b32 s8, s8, s12 +; GFX10-NEXT: s_add_i32 s9, s9, s8 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s1, s11, s1 +; GFX10-NEXT: s_xor_b32 s9, s9, s8 +; GFX10-NEXT: s_xor_b32 s3, s8, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s8, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, v0, s2 +; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s10 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s9, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s10, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s12, s2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3 +; GFX10-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s3, v1 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s10, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v1, s0 +; GFX10-NEXT: v_xor_b32_e32 v0, s11, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s8, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s11, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s8, v1 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i32> %x, %y store <2 x i32> %div, ptr addrspace(1) %out0 @@ -1133,139 +1133,139 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s0, s12, 31 -; GFX10-NEXT: s_ashr_i32 s1, s13, 31 -; GFX10-NEXT: s_ashr_i32 s2, s14, 31 -; GFX10-NEXT: s_ashr_i32 s3, s15, 31 -; GFX10-NEXT: s_add_i32 s6, s12, s0 -; GFX10-NEXT: s_add_i32 s7, s13, s1 -; GFX10-NEXT: s_add_i32 s12, s14, s2 -; GFX10-NEXT: s_add_i32 s13, s15, s3 -; GFX10-NEXT: s_xor_b32 s14, s6, s0 -; GFX10-NEXT: s_xor_b32 s15, s7, s1 -; GFX10-NEXT: s_xor_b32 s12, s12, s2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX10-NEXT: s_xor_b32 s13, s13, s3 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13 +; GFX10-NEXT: s_ashr_i32 s3, s12, 31 +; GFX10-NEXT: s_ashr_i32 s6, s13, 31 +; GFX10-NEXT: s_add_i32 s0, s12, s3 +; GFX10-NEXT: s_ashr_i32 s7, s14, 31 +; GFX10-NEXT: s_xor_b32 s12, s0, s3 +; GFX10-NEXT: s_add_i32 s0, s13, s6 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX10-NEXT: s_xor_b32 s13, s0, s6 +; GFX10-NEXT: s_sub_i32 s0, 0, s12 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX10-NEXT: s_ashr_i32 s16, s15, 31 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s14 +; GFX10-NEXT: s_ashr_i32 s17, s8, 31 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX10-NEXT: s_sub_i32 s7, 0, s15 -; GFX10-NEXT: s_sub_i32 s19, 0, s12 -; GFX10-NEXT: s_ashr_i32 s16, s8, 31 -; GFX10-NEXT: s_ashr_i32 s17, s9, 31 -; GFX10-NEXT: s_ashr_i32 s18, s10, 31 +; GFX10-NEXT: s_xor_b32 s3, s17, s3 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: s_xor_b32 s20, s16, s0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: s_xor_b32 s21, s17, s1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0 +; GFX10-NEXT: s_add_i32 s0, s14, s7 +; GFX10-NEXT: s_xor_b32 s14, s0, s7 +; GFX10-NEXT: s_sub_i32 s0, 0, s13 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s14 +; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX10-NEXT: s_sub_i32 s0, 0, s14 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s13 -; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1 -; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 -; GFX10-NEXT: s_ashr_i32 s19, s11, 31 -; GFX10-NEXT: s_add_i32 s6, s8, s16 -; GFX10-NEXT: s_add_i32 s7, s9, s17 -; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 -; GFX10-NEXT: s_add_i32 s8, s10, s18 -; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 -; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX10-NEXT: s_add_i32 s9, s11, s19 -; GFX10-NEXT: s_xor_b32 s10, s6, s16 -; GFX10-NEXT: s_xor_b32 s11, s7, s17 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 -; GFX10-NEXT: s_xor_b32 s8, s8, s18 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v6 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v7 -; GFX10-NEXT: s_xor_b32 s9, s9, s19 -; GFX10-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1 -; GFX10-NEXT: v_mul_hi_u32 v2, s8, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 -; GFX10-NEXT: s_xor_b32 s22, s18, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_mul_lo_u32 v4, v0, s14 -; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15 -; GFX10-NEXT: v_mul_lo_u32 v6, v2, s12 -; GFX10-NEXT: v_mul_lo_u32 v7, v3, s13 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, s10, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, s11, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s8, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, s9, v7 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_mul_lo_u32 v2, s0, v3 +; GFX10-NEXT: s_add_i32 s0, s15, s16 +; GFX10-NEXT: s_xor_b32 s15, s0, s16 +; GFX10-NEXT: s_sub_i32 s0, 0, s15 +; GFX10-NEXT: v_mul_hi_u32 v2, v3, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s15 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, s0, v3 +; GFX10-NEXT: s_add_i32 s0, s8, s17 +; GFX10-NEXT: s_ashr_i32 s8, s9, 31 +; GFX10-NEXT: s_xor_b32 s0, s0, s17 +; GFX10-NEXT: s_xor_b32 s6, s8, s6 +; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX10-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GFX10-NEXT: v_mul_lo_u32 v4, v0, s12 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s0, v4 +; GFX10-NEXT: s_add_i32 s0, s9, s8 +; GFX10-NEXT: s_ashr_i32 s9, s10, 31 +; GFX10-NEXT: s_xor_b32 s0, s0, s8 +; GFX10-NEXT: s_add_i32 s1, s10, s9 +; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v4 +; GFX10-NEXT: s_xor_b32 s1, s1, s9 +; GFX10-NEXT: s_ashr_i32 s10, s11, 31 +; GFX10-NEXT: v_mul_hi_u32 v2, s1, v2 +; GFX10-NEXT: s_add_i32 s2, s11, s10 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: s_xor_b32 s2, s2, s10 +; GFX10-NEXT: v_mul_lo_u32 v5, v1, s13 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_mul_hi_u32 v3, s2, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 1, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s0, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_mul_lo_u32 v6, v2, s14 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s1 +; GFX10-NEXT: v_mul_lo_u32 v7, v3, s15 +; GFX10-NEXT: v_sub_nc_u32_e32 v7, s2, v7 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v8, s2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s12, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s13, v5 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v8, s1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s15, v7 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v8, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2 -; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 -; GFX10-NEXT: s_xor_b32 s0, s19, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2 -; GFX10-NEXT: v_xor_b32_e32 v0, s20, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s21, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s22, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX10-NEXT: v_xor_b32_e32 v4, s16, v4 -; GFX10-NEXT: v_xor_b32_e32 v5, s17, v5 -; GFX10-NEXT: v_xor_b32_e32 v6, s18, v6 -; GFX10-NEXT: v_xor_b32_e32 v7, s19, v7 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s20, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s21, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s22, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s0, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v1 +; GFX10-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s3, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v8, s1 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v3 +; GFX10-NEXT: s_xor_b32 s3, s9, s7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_xor_b32_e32 v2, s3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v8, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s16, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s17, v5 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s18, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s19, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s3, v2 +; GFX10-NEXT: s_xor_b32 s3, s10, s16 +; GFX10-NEXT: v_xor_b32_e32 v3, s3, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s3, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s12, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s13, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s14, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s15, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s2 +; GFX10-NEXT: v_xor_b32_e32 v0, s17, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s8, v1 +; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s17, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s8, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s10, v3 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm %div = sdiv <4 x i32> %x, %y store <4 x i32> %div, ptr addrspace(1) %out0 @@ -1895,300 +1895,300 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s2, s9, 31 -; GFX10-NEXT: s_ashr_i32 s6, s13, 31 -; GFX10-NEXT: s_add_u32 s0, s8, s2 -; GFX10-NEXT: s_addc_u32 s1, s9, s2 -; GFX10-NEXT: s_add_u32 s8, s12, s6 -; GFX10-NEXT: s_mov_b32 s7, s6 -; GFX10-NEXT: s_addc_u32 s9, s13, s6 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX10-NEXT: s_sub_u32 s20, 0, s8 -; GFX10-NEXT: s_subb_u32 s21, 0, s9 -; GFX10-NEXT: s_ashr_i32 s12, s11, 31 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX10-NEXT: s_xor_b64 s[18:19], s[2:3], s[6:7] -; GFX10-NEXT: s_ashr_i32 s16, s15, 31 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX10-NEXT: s_add_u32 s6, s10, s12 -; GFX10-NEXT: s_addc_u32 s7, s11, s12 -; GFX10-NEXT: s_add_u32 s10, s14, s16 +; GFX10-NEXT: s_ashr_i32 s16, s9, 31 +; GFX10-NEXT: s_ashr_i32 s0, s13, 31 +; GFX10-NEXT: s_add_u32 s2, s8, s16 +; GFX10-NEXT: s_addc_u32 s3, s9, s16 +; GFX10-NEXT: s_add_u32 s6, s12, s0 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_addc_u32 s7, s13, s0 ; GFX10-NEXT: s_mov_b32 s17, s16 -; GFX10-NEXT: s_addc_u32 s11, s15, s16 -; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: s_xor_b64 s[10:11], s[10:11], s[16:17] +; GFX10-NEXT: s_xor_b64 s[8:9], s[6:7], s[0:1] +; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX10-NEXT: s_sub_u32 s22, 0, s8 +; GFX10-NEXT: s_subb_u32 s23, 0, s9 +; GFX10-NEXT: s_ashr_i32 s12, s11, 31 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 +; GFX10-NEXT: s_xor_b64 s[18:19], s[16:17], s[0:1] +; GFX10-NEXT: s_ashr_i32 s20, s15, 31 +; GFX10-NEXT: s_add_u32 s0, s10, s12 +; GFX10-NEXT: s_addc_u32 s1, s11, s12 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_mov_b32 s21, s20 ; GFX10-NEXT: s_mov_b32 s13, s12 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s10 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_xor_b64 s[14:15], s[6:7], s[12:13] -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GFX10-NEXT: v_trunc_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GFX10-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_mul_lo_u32 v7, s20, v5 -; GFX10-NEXT: v_trunc_f32_e32 v4, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s3, s20, v6, 0 -; GFX10-NEXT: v_mul_lo_u32 v8, s21, v6 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v4 -; GFX10-NEXT: s_sub_u32 s3, 0, s10 -; GFX10-NEXT: s_subb_u32 s6, 0, s11 -; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX10-NEXT: v_mul_lo_u32 v9, s3, v3 -; GFX10-NEXT: v_add3_u32 v7, v1, v7, v8 -; GFX10-NEXT: v_mul_lo_u32 v10, v5, v0 -; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s7, s3, v4, 0 -; GFX10-NEXT: v_mul_lo_u32 v8, s6, v4 -; GFX10-NEXT: v_mul_lo_u32 v12, v6, v7 -; GFX10-NEXT: v_mul_hi_u32 v0, v5, v0 -; GFX10-NEXT: v_mul_lo_u32 v13, v5, v7 -; GFX10-NEXT: v_mul_hi_u32 v14, v6, v7 -; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX10-NEXT: v_add3_u32 v2, v2, v9, v8 -; GFX10-NEXT: v_add_co_u32 v10, s7, v10, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v0, s7, v13, v0 -; GFX10-NEXT: v_mul_lo_u32 v8, v3, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s7 -; GFX10-NEXT: v_mul_lo_u32 v15, v4, v2 -; GFX10-NEXT: v_add_co_u32 v10, s7, v10, v11 -; GFX10-NEXT: v_mul_hi_u32 v9, v4, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, v3, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v0, s7, v0, v14 -; GFX10-NEXT: v_mul_lo_u32 v14, v3, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s7 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v12, v10 -; GFX10-NEXT: v_add_co_u32 v8, s7, v8, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s7 -; GFX10-NEXT: v_mul_hi_u32 v16, v4, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11 -; GFX10-NEXT: v_add_co_u32 v1, s7, v14, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v0, s7, v0, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v8, s7, v8, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v9, s7, v1, v16 -; GFX10-NEXT: v_add3_u32 v7, v11, v10, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v8, v12, v8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo -; GFX10-NEXT: v_mul_hi_u32 v2, v3, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v13, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s7, s20, v6, 0 -; GFX10-NEXT: v_add_co_u32 v7, s7, v9, v8 -; GFX10-NEXT: v_mul_lo_u32 v9, s21, v6 -; GFX10-NEXT: v_mul_lo_u32 v11, s20, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7 -; GFX10-NEXT: v_add3_u32 v2, v10, v8, v2 -; GFX10-NEXT: v_mul_lo_u32 v8, v5, v0 -; GFX10-NEXT: v_add3_u32 v7, v1, v11, v9 -; GFX10-NEXT: v_mul_hi_u32 v10, v6, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v5, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v2, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v12, v6, v7 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s7, s3, v4, 0 -; GFX10-NEXT: v_mul_lo_u32 v9, s6, v4 -; GFX10-NEXT: v_mul_lo_u32 v11, s3, v3 -; GFX10-NEXT: v_mul_lo_u32 v13, v5, v7 -; GFX10-NEXT: v_mul_hi_u32 v14, v6, v7 -; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX10-NEXT: v_add_co_u32 v8, s3, v8, v12 -; GFX10-NEXT: v_mul_lo_u32 v15, v3, v1 -; GFX10-NEXT: v_mul_hi_u32 v16, v4, v1 -; GFX10-NEXT: v_add3_u32 v2, v2, v11, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v0, s3, v13, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v8, s3, v8, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v0, s3, v0, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s3 -; GFX10-NEXT: v_mul_lo_u32 v12, v4, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v8, v9, v8 -; GFX10-NEXT: v_mul_hi_u32 v1, v3, v1 -; GFX10-NEXT: v_mul_lo_u32 v13, v3, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v11, v10 -; GFX10-NEXT: v_mul_hi_u32 v9, v4, v2 -; GFX10-NEXT: v_add_co_u32 v0, s3, v0, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v11, s3, v15, v12 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0 -; GFX10-NEXT: v_add3_u32 v7, v10, v8, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v1, s3, v13, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo -; GFX10-NEXT: v_add_co_u32 v8, s3, v11, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v1, s3, v1, v9 -; GFX10-NEXT: v_mul_lo_u32 v7, s1, v0 -; GFX10-NEXT: v_mul_lo_u32 v9, s0, v5 -; GFX10-NEXT: v_mul_hi_u32 v10, s1, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v11, s1, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3 -; GFX10-NEXT: v_add_nc_u32_e32 v8, v12, v8 -; GFX10-NEXT: v_mul_hi_u32 v12, s0, v5 -; GFX10-NEXT: v_mul_hi_u32 v5, s1, v5 -; GFX10-NEXT: v_add_co_u32 v7, s3, v7, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v10, s3, v11, v10 -; GFX10-NEXT: v_add_co_u32 v0, s6, v7, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3 -; GFX10-NEXT: v_add_co_u32 v10, s3, v10, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v9, v0 -; GFX10-NEXT: v_add_co_u32 v8, s3, v1, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s3 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v11 -; GFX10-NEXT: v_add_co_u32 v9, s3, v10, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 -; GFX10-NEXT: v_mul_hi_u32 v2, v3, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v13, v6 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8 -; GFX10-NEXT: v_add3_u32 v5, v7, v0, v5 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_mul_hi_u32 v8, s14, v4 -; GFX10-NEXT: v_add3_u32 v2, v6, v1, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s3, s8, v9, 0 -; GFX10-NEXT: v_mul_lo_u32 v6, s9, v9 -; GFX10-NEXT: v_mul_lo_u32 v7, s8, v5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v2, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v3, s15, v4 -; GFX10-NEXT: v_mul_hi_u32 v4, s15, v4 -; GFX10-NEXT: v_mul_lo_u32 v10, s14, v2 -; GFX10-NEXT: v_mul_lo_u32 v11, s15, v2 -; GFX10-NEXT: v_add3_u32 v1, v1, v7, v6 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v9, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v12, s1, v1 -; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v14, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s9, v12, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v13, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s0, 0, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v14 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s9, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v19, s0, v6, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v20, s0, 0, v7, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v18, v17, s0 -; GFX10-NEXT: v_add_co_u32 v1, s0, v3, v10 -; GFX10-NEXT: v_mul_hi_u32 v10, s14, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v11, v4 -; GFX10-NEXT: v_add_co_u32 v1, s1, v1, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX10-NEXT: v_trunc_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0 +; GFX10-NEXT: v_mul_lo_u32 v4, s22, v2 +; GFX10-NEXT: v_mul_lo_u32 v5, s23, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s22, v3, 0 +; GFX10-NEXT: v_add3_u32 v1, v1, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v4, v2, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX10-NEXT: v_add_co_u32 v4, s6, v4, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 +; GFX10-NEXT: v_mul_hi_u32 v0, v2, v0 +; GFX10-NEXT: v_add_co_u32 v4, s6, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v5, v2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 +; GFX10-NEXT: v_add_co_u32 v0, s6, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v0, s6, v0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v4, s6, v0, v4 +; GFX10-NEXT: v_mul_hi_u32 v0, v2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v6, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 +; GFX10-NEXT: s_add_u32 s6, s14, s20 +; GFX10-NEXT: s_addc_u32 s7, s15, s20 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX10-NEXT: s_xor_b64 s[10:11], s[6:7], s[20:21] +; GFX10-NEXT: v_add3_u32 v5, v5, v1, v0 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10 +; GFX10-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] +; GFX10-NEXT: s_sub_u32 s0, 0, s10 +; GFX10-NEXT: s_subb_u32 s1, 0, s11 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v4, s23, v3 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_lo_u32 v5, s22, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX10-NEXT: v_trunc_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_f32_e32 v6, 0xcf800000, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v1 +; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 +; GFX10-NEXT: v_mul_lo_u32 v8, s0, v6 +; GFX10-NEXT: v_mul_lo_u32 v9, s1, v7 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s0, v7, 0 +; GFX10-NEXT: v_add3_u32 v1, v1, v8, v9 +; GFX10-NEXT: v_mul_lo_u32 v8, v6, v0 +; GFX10-NEXT: v_mul_lo_u32 v9, v7, v1 +; GFX10-NEXT: v_add_co_u32 v8, s14, v8, v9 +; GFX10-NEXT: v_mul_hi_u32 v9, v7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s14 +; GFX10-NEXT: v_mul_hi_u32 v0, v6, v0 +; GFX10-NEXT: v_add_co_u32 v8, s14, v8, v9 +; GFX10-NEXT: v_mul_lo_u32 v9, v6, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s14 +; GFX10-NEXT: v_add_nc_u32_e32 v8, v10, v8 +; GFX10-NEXT: v_add_co_u32 v0, s14, v9, v0 +; GFX10-NEXT: v_mul_hi_u32 v9, v7, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s14 +; GFX10-NEXT: v_add_co_u32 v0, s14, v0, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s14 +; GFX10-NEXT: v_add_co_u32 v8, s14, v0, v8 +; GFX10-NEXT: v_mul_hi_u32 v0, v6, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v9, v10, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s14 +; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v7, v8 +; GFX10-NEXT: v_add3_u32 v9, v9, v1, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s22, v3, 0 +; GFX10-NEXT: v_mul_lo_u32 v8, s1, v7 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v9, vcc_lo +; GFX10-NEXT: v_add3_u32 v1, v1, v5, v4 +; GFX10-NEXT: v_mul_lo_u32 v4, v2, v0 +; GFX10-NEXT: v_mul_lo_u32 v9, s0, v6 +; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX10-NEXT: v_add_co_u32 v4, s14, v4, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s14 +; GFX10-NEXT: v_mul_hi_u32 v0, v2, v0 +; GFX10-NEXT: v_add_co_u32 v4, s14, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v5, v2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s14 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v10, v4 +; GFX10-NEXT: v_add_co_u32 v0, s14, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s14 +; GFX10-NEXT: v_add_co_u32 v0, s14, v0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s14 +; GFX10-NEXT: v_add_co_u32 v4, s14, v0, v4 +; GFX10-NEXT: v_mul_hi_u32 v0, v2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v10, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s14 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v7, 0 +; GFX10-NEXT: v_mul_lo_u32 v4, s3, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo +; GFX10-NEXT: v_add3_u32 v1, v1, v9, v8 +; GFX10-NEXT: v_mul_lo_u32 v8, v6, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX10-NEXT: v_mul_lo_u32 v9, v7, v1 +; GFX10-NEXT: v_add_co_u32 v8, s0, v8, v9 +; GFX10-NEXT: v_mul_hi_u32 v9, v7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v0, v6, v0 +; GFX10-NEXT: v_add_co_u32 v8, s0, v8, v9 +; GFX10-NEXT: v_mul_lo_u32 v9, v6, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10 +; GFX10-NEXT: v_add_nc_u32_e32 v8, v10, v8 +; GFX10-NEXT: v_add_co_u32 v0, s0, v9, v0 +; GFX10-NEXT: v_mul_hi_u32 v9, v7, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1 -; GFX10-NEXT: v_mul_hi_u32 v2, s15, v2 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v8, v10 -; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v1 +; GFX10-NEXT: v_mul_hi_u32 v1, v6, v1 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, s2, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v9, v10, v9 +; GFX10-NEXT: v_mul_hi_u32 v3, s3, v3 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v7, v0 +; GFX10-NEXT: v_add3_u32 v1, v9, v8, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v5, s3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v6, v1, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v4, v8, v4 +; GFX10-NEXT: v_add_co_u32 v3, s0, v5, v3 +; GFX10-NEXT: v_mul_hi_u32 v5, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v2, s3, v2 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_mul_lo_u32 v7, s9, v3 +; GFX10-NEXT: v_add3_u32 v2, v5, v4, v2 +; GFX10-NEXT: v_mul_lo_u32 v4, s7, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s6, v1 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, s6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 +; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, s6, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v0, v4 +; GFX10-NEXT: v_mul_hi_u32 v0, s7, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v6, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_sub_co_u32 v8, s0, v12, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v0, s0 -; GFX10-NEXT: v_add3_u32 v2, v3, v1, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v19, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s10, v4, 0 -; GFX10-NEXT: v_mul_lo_u32 v7, s10, v2 -; GFX10-NEXT: v_mul_lo_u32 v11, s11, v4 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v17 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX10-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v12, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo -; GFX10-NEXT: v_add3_u32 v1, v1, v7, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v10, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v8, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v8, s0, s14, v0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s1, s15, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s15, v1 +; GFX10-NEXT: v_mul_lo_u32 v6, s8, v2 +; GFX10-NEXT: v_mul_lo_u32 v17, s11, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s8, v3, 0 +; GFX10-NEXT: v_mul_lo_u32 v16, s10, v5 +; GFX10-NEXT: v_add3_u32 v1, v1, v6, v7 +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, s2, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v7, s0, s3, v1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s9, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v0, v8, s0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s3, v1 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s9, v0, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v6, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v9, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v10 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s9, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v0, v1, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v3, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v13, s0, 0, v2, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v0, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v13, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v0, v1, s0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s10, v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v14, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 +; GFX10-NEXT: v_add3_u32 v1, v1, v16, v17 +; GFX10-NEXT: v_sub_co_u32 v8, s1, s6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s7, v1 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v14, s2, s7, v1, s1 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s1, s11, v0, s1 +; GFX10-NEXT: v_sub_co_u32 v17, s1, v8, s10 +; GFX10-NEXT: v_cmp_le_u32_e64 s6, s11, v14 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s2, 0, v16, s1 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s10, v17 +; GFX10-NEXT: v_cmp_le_u32_e64 s3, s11, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s11, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, -1, s3 +; GFX10-NEXT: v_cmp_le_u32_e64 s3, s10, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v1, s2 +; GFX10-NEXT: v_add_co_u32 v1, s2, v4, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v19, s2, 0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s3 +; GFX10-NEXT: v_add_co_u32 v20, s2, v1, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v21, s2, 0, v19, s2 +; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, -1, s6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s11, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v20, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v19, v21, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v22, s3 +; GFX10-NEXT: v_cmp_ne_u32_e64 s3, 0, v0 ; GFX10-NEXT: v_xor_b32_e32 v0, s18, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v9 -; GFX10-NEXT: v_xor_b32_e32 v3, s19, v5 -; GFX10-NEXT: v_xor_b32_e32 v6, s2, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, vcc_lo, s11, v1, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v8, s10 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v10, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s18 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s19, v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v9 -; GFX10-NEXT: v_xor_b32_e32 v3, s2, v7 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, s11, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v14, s0, v4, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s0, 0, v2, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0 -; GFX10-NEXT: v_add_co_u32 v11, s0, v14, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v15, s0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7 -; GFX10-NEXT: v_sub_co_u32 v7, s0, v12, s10 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v15, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v12, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, v4, v11, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v14, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v7, s0 -; GFX10-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17] -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s2 -; GFX10-NEXT: v_xor_b32_e32 v3, s0, v10 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v6, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v6, s1, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, s12, v8 -; GFX10-NEXT: v_xor_b32_e32 v7, s12, v7 -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s12 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v1, s3 +; GFX10-NEXT: v_xor_b32_e32 v1, s19, v2 +; GFX10-NEXT: v_sub_co_u32 v0, s6, v0, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v13, s3 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s6, s19, v1, s6 +; GFX10-NEXT: s_xor_b64 s[6:7], s[12:13], s[20:21] +; GFX10-NEXT: v_xor_b32_e32 v3, s6, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, s7, v2 +; GFX10-NEXT: v_sub_co_u32 v2, s6, v3, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v3, s6, s7, v4, s6 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s9, v9, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v1, vcc_lo, v10, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, vcc_lo, s11, v16, s1 +; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, v17, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v6, v1, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v17, v3, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s0 +; GFX10-NEXT: v_xor_b32_e32 v1, s16, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v3, s3 +; GFX10-NEXT: v_xor_b32_e32 v5, s16, v0 +; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v1, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v2, s3 +; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s16, v5, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v5, s12, v2 +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, s12 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s12, v5, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 @@ -2520,84 +2520,84 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80018 -; GFX10-NEXT: s_bfe_i32 s2, s0, 0x80010 -; GFX10-NEXT: s_ashr_i32 s3, s1, 31 -; GFX10-NEXT: s_ashr_i32 s8, s2, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s3 -; GFX10-NEXT: s_add_i32 s2, s2, s8 -; GFX10-NEXT: s_xor_b32 s1, s1, s3 -; GFX10-NEXT: s_xor_b32 s2, s2, s8 +; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80010 +; GFX10-NEXT: s_bfe_i32 s2, s0, 0x80018 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s3, s2, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: s_xor_b32 s1, s1, s6 +; GFX10-NEXT: s_xor_b32 s2, s2, s3 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 -; GFX10-NEXT: s_sub_i32 s7, 0, s2 +; GFX10-NEXT: s_sub_i32 s7, 0, s1 +; GFX10-NEXT: s_sub_i32 s8, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1 -; GFX10-NEXT: s_sext_i32_i8 s6, s0 +; GFX10-NEXT: v_mul_lo_u32 v2, s7, v0 +; GFX10-NEXT: s_sext_i32_i8 s7, s0 +; GFX10-NEXT: v_mul_lo_u32 v3, s8, v1 +; GFX10-NEXT: s_ashr_i32 s9, s7, 31 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x80008 -; GFX10-NEXT: s_ashr_i32 s9, s6, 31 -; GFX10-NEXT: s_ashr_i32 s10, s0, 31 -; GFX10-NEXT: s_add_i32 s6, s6, s9 -; GFX10-NEXT: s_add_i32 s0, s0, s10 +; GFX10-NEXT: s_add_i32 s7, s7, s9 +; GFX10-NEXT: s_ashr_i32 s8, s0, 31 +; GFX10-NEXT: s_xor_b32 s7, s7, s9 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s0, s0, s10 -; GFX10-NEXT: s_xor_b32 s6, s6, s9 +; GFX10-NEXT: s_add_i32 s0, s0, s8 +; GFX10-NEXT: s_xor_b32 s0, s0, s8 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s6, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 +; GFX10-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, v0, s1 +; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s7, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s6, v3 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s7, s8, s3 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s2, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v3 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s2, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: s_xor_b32 s1, s10, s3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v2, s10, v2 -; GFX10-NEXT: s_xor_b32 s0, s9, s8 -; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v3, s9, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s10, v2 -; GFX10-NEXT: s_movk_i32 s1, 0xff -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s9, v3 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v2 +; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 +; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s9, s6 +; GFX10-NEXT: s_movk_i32 s5, 0xff +; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_xor_b32_e32 v3, s8, v3 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s8, v3 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v3, s5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_short v1, v0, s[4:5] -; GFX10-NEXT: global_store_short v1, v2, s[6:7] +; GFX10-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-NEXT: global_store_short v1, v2, s[2:3] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i8> %x, %y store <2 x i8> %div, ptr addrspace(1) %out0 @@ -2928,79 +2928,79 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s1 ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x100010 -; GFX10-NEXT: s_ashr_i32 s3, s2, 31 -; GFX10-NEXT: s_ashr_i32 s8, s1, 31 -; GFX10-NEXT: s_add_i32 s2, s2, s3 -; GFX10-NEXT: s_add_i32 s1, s1, s8 -; GFX10-NEXT: s_xor_b32 s2, s2, s3 -; GFX10-NEXT: s_xor_b32 s1, s1, s8 +; GFX10-NEXT: s_ashr_i32 s6, s2, 31 +; GFX10-NEXT: s_ashr_i32 s7, s1, 31 +; GFX10-NEXT: s_add_i32 s2, s2, s6 +; GFX10-NEXT: s_add_i32 s1, s1, s7 +; GFX10-NEXT: s_xor_b32 s2, s2, s6 +; GFX10-NEXT: s_xor_b32 s1, s1, s7 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX10-NEXT: s_sub_i32 s6, 0, s2 -; GFX10-NEXT: s_sub_i32 s7, 0, s1 +; GFX10-NEXT: s_sub_i32 s3, 0, s2 +; GFX10-NEXT: s_sub_i32 s8, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1 -; GFX10-NEXT: s_sext_i32_i16 s6, s0 +; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX10-NEXT: s_sext_i32_i16 s3, s0 +; GFX10-NEXT: v_mul_lo_u32 v3, s8, v1 +; GFX10-NEXT: s_ashr_i32 s9, s3, 31 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX10-NEXT: s_ashr_i32 s9, s6, 31 -; GFX10-NEXT: s_ashr_i32 s10, s0, 31 -; GFX10-NEXT: s_add_i32 s6, s6, s9 -; GFX10-NEXT: s_add_i32 s0, s0, s10 +; GFX10-NEXT: s_add_i32 s3, s3, s9 +; GFX10-NEXT: s_ashr_i32 s8, s0, 31 +; GFX10-NEXT: s_xor_b32 s3, s3, s9 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s6, s6, s9 -; GFX10-NEXT: s_xor_b32 s0, s0, s10 +; GFX10-NEXT: s_add_i32 s0, s0, s8 +; GFX10-NEXT: s_xor_b32 s0, s0, s8 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 +; GFX10-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, v0, s2 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: s_xor_b32 s1, s9, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s10, s8 -; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3 ; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s9, s6 +; GFX10-NEXT: s_xor_b32 s5, s8, s7 +; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_xor_b32_e32 v3, s8, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s10, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s8, v3 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: global_store_dword v1, v2, s[6:7] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v2, s[2:3] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i16> %x, %y store <2 x i16> %div, ptr addrspace(1) %out0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -1444,21 +1444,37 @@ ; GFX9-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_sext_inreg_i65_22: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] -; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 10, v1 -; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1] -; GFX10PLUS-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX10PLUS-NEXT: v_bfe_u32 v1, v1, 0, 10 -; GFX10PLUS-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v4, 10, v2 -; GFX10PLUS-NEXT: v_ashrrev_i64 v[2:3], 22, v[2:3] -; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_sext_inreg_i65_22: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 10, v1 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1] +; GFX10-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX10-NEXT: v_bfe_u32 v1, v1, 0, 10 +; GFX10-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 10, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_ashrrev_i64 v[2:3], 22, v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_sext_inreg_i65_22: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 10, v1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: v_bfe_u32 v1, v1, 0, 10 +; GFX11-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 10, v2 +; GFX11-NEXT: v_ashrrev_i64 v[2:3], 22, v[2:3] +; GFX11-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl i65 %value, 22 %ashr = ashr i65 %shl, 22 ret i65 %ashr @@ -1501,18 +1517,31 @@ ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_sext_inreg_i65_33: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v1 -; GFX10PLUS-NEXT: v_bfe_i32 v1, v2, 0, 1 -; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2 -; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_sext_inreg_i65_33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_bfe_i32 v0, v2, 0, 1 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 1, v1 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] +; GFX10-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_sext_inreg_i65_33: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: v_bfe_i32 v1, v2, 0, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 1, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl i65 %value, 33 %ashr = ashr i65 %value, 33 ret i65 %ashr Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -497,8 +497,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, v16, v0 +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, v17, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, v18, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, v19, v3 @@ -514,15 +514,15 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v13, v29, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v14, v30, v14 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v15, v31, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, v16, v15 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_shl_v16i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, v16, v0 +; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, v17, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v18, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v19, v3 @@ -538,7 +538,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v13, v29, v13 ; GFX11-NEXT: v_lshlrev_b32_e32 v14, v30, v14 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v15, v31, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, v16, v15 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = shl <16 x i32> %value, %amount ret <16 x i32> %result @@ -1690,19 +1690,19 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, 64, v3 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3] -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 64, v3 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 -; GFX10-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1] -; GFX10-NEXT: v_or_b32_e32 v1, v5, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v3 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3] +; GFX10-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 64, v3 +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v2, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_shl_i65: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -662,28 +662,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 ; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100 -; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_perm_b32 v3, v3, v1, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v4, v1, 16 ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v2, v2, v3 clamp +; GFX10-NEXT: v_mov_b32_e32 v3, 8 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp -; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_pk_ashrrev_i16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 24 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ssubsat_v4i8: @@ -1938,19 +1938,33 @@ ; GFX9-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_ssubsat_v5i32: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s5 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v1, s1, s6 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v2, s2, s7 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v3, s3, s8 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v4, s4, s9 clamp -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_ssubsat_v5i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_nc_i32 v0, s0, s5 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, s1, s6 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, s2, s7 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, s3, s8 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_sub_nc_i32 v0, s4, s9 clamp +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_ssubsat_v5i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_sub_nc_i32 v0, s0, s5 clamp +; GFX11-NEXT: v_sub_nc_i32 v1, s1, s6 clamp +; GFX11-NEXT: v_sub_nc_i32 v2, s2, s7 clamp +; GFX11-NEXT: v_sub_nc_i32 v3, s3, s8 clamp +; GFX11-NEXT: v_sub_nc_i32 v4, s4, s9 clamp +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: ; return to shader part epilog %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) ret <5 x i32> %result } @@ -2229,8 +2243,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_sub_nc_i32 v0, v0, v16 clamp +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX10-NEXT: v_sub_nc_i32 v1, v1, v17 clamp ; GFX10-NEXT: v_sub_nc_i32 v2, v2, v18 clamp ; GFX10-NEXT: v_sub_nc_i32 v3, v3, v19 clamp @@ -2246,15 +2260,15 @@ ; GFX10-NEXT: v_sub_nc_i32 v13, v13, v29 clamp ; GFX10-NEXT: v_sub_nc_i32 v14, v14, v30 clamp ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_i32 v15, v15, v31 clamp +; GFX10-NEXT: v_sub_nc_i32 v15, v15, v16 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ssubsat_v16i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: v_sub_nc_i32 v0, v0, v16 clamp +; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: v_sub_nc_i32 v1, v1, v17 clamp ; GFX11-NEXT: v_sub_nc_i32 v2, v2, v18 clamp ; GFX11-NEXT: v_sub_nc_i32 v3, v3, v19 clamp @@ -2270,7 +2284,7 @@ ; GFX11-NEXT: v_sub_nc_i32 v13, v13, v29 clamp ; GFX11-NEXT: v_sub_nc_i32 v14, v14, v30 clamp ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_i32 v15, v15, v31 clamp +; GFX11-NEXT: v_sub_nc_i32 v15, v15, v16 clamp ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -2561,41 +2575,77 @@ ; GFX9-NEXT: v_readfirstlane_b32 s15, v15 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_ssubsat_v16i32: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s16 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v1, s1, s17 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v2, s2, s18 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v3, s3, s19 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v4, s4, s20 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v5, s5, s21 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v6, s6, s22 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v7, s7, s23 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v8, s8, s24 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v9, s9, s25 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v10, s10, s26 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v11, s11, s27 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v12, s12, s28 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v13, s13, s29 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v14, s14, s30 clamp -; GFX10PLUS-NEXT: v_sub_nc_i32 v15, s15, s31 clamp -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s8, v8 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s9, v9 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s10, v10 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s11, v11 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s12, v12 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s13, v13 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s14, v14 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s15, v15 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_ssubsat_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_nc_i32 v0, s0, s16 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, s1, s17 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, s2, s18 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, s3, s19 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_sub_nc_i32 v0, s4, s20 clamp +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_sub_nc_i32 v1, s5, s21 clamp +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_sub_nc_i32 v2, s6, s22 clamp +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_sub_nc_i32 v3, s7, s23 clamp +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_sub_nc_i32 v0, s8, s24 clamp +; GFX10-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-NEXT: v_sub_nc_i32 v1, s9, s25 clamp +; GFX10-NEXT: v_readfirstlane_b32 s6, v2 +; GFX10-NEXT: v_sub_nc_i32 v2, s10, s26 clamp +; GFX10-NEXT: v_readfirstlane_b32 s7, v3 +; GFX10-NEXT: v_sub_nc_i32 v3, s11, s27 clamp +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_sub_nc_i32 v0, s12, s28 clamp +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_sub_nc_i32 v1, s13, s29 clamp +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_sub_nc_i32 v2, s14, s30 clamp +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_sub_nc_i32 v3, s15, s31 clamp +; GFX10-NEXT: v_readfirstlane_b32 s12, v0 +; GFX10-NEXT: v_readfirstlane_b32 s13, v1 +; GFX10-NEXT: v_readfirstlane_b32 s14, v2 +; GFX10-NEXT: v_readfirstlane_b32 s15, v3 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_ssubsat_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_sub_nc_i32 v0, s0, s16 clamp +; GFX11-NEXT: v_sub_nc_i32 v1, s1, s17 clamp +; GFX11-NEXT: v_sub_nc_i32 v2, s2, s18 clamp +; GFX11-NEXT: v_sub_nc_i32 v3, s3, s19 clamp +; GFX11-NEXT: v_sub_nc_i32 v4, s4, s20 clamp +; GFX11-NEXT: v_sub_nc_i32 v5, s5, s21 clamp +; GFX11-NEXT: v_sub_nc_i32 v6, s6, s22 clamp +; GFX11-NEXT: v_sub_nc_i32 v7, s7, s23 clamp +; GFX11-NEXT: v_sub_nc_i32 v8, s8, s24 clamp +; GFX11-NEXT: v_sub_nc_i32 v9, s9, s25 clamp +; GFX11-NEXT: v_sub_nc_i32 v10, s10, s26 clamp +; GFX11-NEXT: v_sub_nc_i32 v11, s11, s27 clamp +; GFX11-NEXT: v_sub_nc_i32 v12, s12, s28 clamp +; GFX11-NEXT: v_sub_nc_i32 v13, s13, s29 clamp +; GFX11-NEXT: v_sub_nc_i32 v14, s14, s30 clamp +; GFX11-NEXT: v_sub_nc_i32 v15, s15, s31 clamp +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: v_readfirstlane_b32 s8, v8 +; GFX11-NEXT: v_readfirstlane_b32 s9, v9 +; GFX11-NEXT: v_readfirstlane_b32 s10, v10 +; GFX11-NEXT: v_readfirstlane_b32 s11, v11 +; GFX11-NEXT: v_readfirstlane_b32 s12, v12 +; GFX11-NEXT: v_readfirstlane_b32 s13, v13 +; GFX11-NEXT: v_readfirstlane_b32 s14, v14 +; GFX11-NEXT: v_readfirstlane_b32 s15, v15 +; GFX11-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result } @@ -4204,11 +4254,11 @@ ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v0 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4416,12 +4466,12 @@ ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 -; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], v[2:3] +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v0 +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4515,11 +4565,11 @@ ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v0 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4595,12 +4645,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 -; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v0 +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4768,12 +4818,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 -; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], v[2:3] +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v0 +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog ; @@ -4841,11 +4891,11 @@ ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v0 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog ; @@ -4944,22 +4994,22 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 +; GFX10-NEXT: v_sub_co_u32 v4, s5, v2, v6 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s5, v3, v7, s5 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[4:5], v[2:3] +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, s4, 0x80000000, v0 +; GFX10-NEXT: v_add_co_u32 v3, s4, 0x80000000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ssubsat_v2i64: @@ -5506,23 +5556,23 @@ ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo ; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, 0, v[0:1] ; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[2:3], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: v_cmp_eq_u64_e64 s0, s[2:3], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v2, s0 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo @@ -5674,24 +5724,24 @@ ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[4:5], v[0:1] ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 +; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_and_b32 s3, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: s_and_b32 s0, 1, s4 -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s3 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo @@ -5910,54 +5960,54 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, 0, v[8:9] +; GFX10-NEXT: v_sub_co_ci_u32_e32 v8, vcc_lo, v2, v10, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v3, v11, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[16:17], v[0:1] +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[8:9], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[8:9], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v18, vcc_lo +; GFX10-NEXT: v_add_co_u32 v3, s4, 0x80000000, v2 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, 0, v[12:13] ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7] ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13] -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, v6, v14, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v7, v15, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15] -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[4:5] +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v19 -; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v3, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v5, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v20, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v21, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v4, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v12, vcc_lo +; GFX10-NEXT: v_add_co_u32 v7, s4, 0x80000000, v6 +; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ssubsat_v2i128: @@ -5969,32 +6019,32 @@ ; GFX11-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo ; GFX11-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, 0, v[8:9] +; GFX11-NEXT: v_sub_co_u32 v8, s1, v4, v12 +; GFX11-NEXT: v_sub_co_ci_u32_e64 v9, s1, v5, v13, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[10:11] ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12 -; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5] -; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0, v[10:11] +; GFX11-NEXT: v_sub_co_ci_u32_e64 v10, s1, v6, v14, s1 +; GFX11-NEXT: v_sub_co_ci_u32_e64 v11, s1, v7, v15, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, v2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7] +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[6:7] +; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13] ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15] ; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[10:11], v[6:7] +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v11 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v19 @@ -6011,8 +6061,8 @@ ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v21, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v10, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v11, v7, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result @@ -6307,83 +6357,82 @@ ; GFX10-NEXT: s_subb_u32 s18, s2, s10 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] ; GFX10-NEXT: s_subb_u32 s19, s3, s11 +; GFX10-NEXT: v_cmp_gt_u64_e64 s1, s[8:9], 0 ; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 -; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: s_ashr_i32 s0, s19, 31 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_ashr_i32 s2, s19, 31 +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[10:11], 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX10-NEXT: s_add_u32 s1, s2, 0x80000000 ; GFX10-NEXT: s_sub_u32 s8, s4, s12 ; GFX10-NEXT: s_subb_u32 s9, s5, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[8:9], s[4:5] ; GFX10-NEXT: s_subb_u32 s10, s6, s14 +; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[8:9], s[4:5] ; GFX10-NEXT: s_subb_u32 s11, s7, s15 -; GFX10-NEXT: s_mov_b32 s3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[10:11], s[6:7] -; GFX10-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s3 +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[10:11], s[6:7] +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, s16 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX10-NEXT: s_and_b32 s4, 1, s16 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 -; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 -; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s3 +; GFX10-NEXT: v_cmp_gt_u64_e64 s3, s[12:13], 0 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_and_b32 s5, 1, s5 -; GFX10-NEXT: s_mov_b32 s2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX10-NEXT: v_mov_b32_e32 v7, s11 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s3 +; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[14:15], 0 +; GFX10-NEXT: s_ashr_i32 s4, s11, 31 +; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX10-NEXT: s_mov_b32 s7, s4 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s3 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, s16 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: v_mov_b32_e32 v5, s19 -; GFX10-NEXT: v_mov_b32_e32 v6, s9 -; GFX10-NEXT: v_xor_b32_e32 v3, v4, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s1, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_mov_b32_e32 v5, s8 -; GFX10-NEXT: s_ashr_i32 s0, s11, 31 -; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, s10 -; GFX10-NEXT: s_mov_b32 s3, s0 -; GFX10-NEXT: s_mov_b32 s2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s1, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s1, v2 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v4 -; GFX10-NEXT: v_readfirstlane_b32 s4, v5 -; GFX10-NEXT: v_readfirstlane_b32 s5, v6 -; GFX10-NEXT: v_readfirstlane_b32 s6, v3 -; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, s2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, s17 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s19 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s1, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s1, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, s10 +; GFX10-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s5, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v2 +; GFX10-NEXT: v_readfirstlane_b32 s6, v1 +; GFX10-NEXT: v_readfirstlane_b32 s7, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_ssubsat_v2i128: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -182,55 +182,55 @@ ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_bfe_u32 s0, 8, 0x100000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, 16 -; GFX10-NEXT: s_bfe_u32 s4, s5, 0x100000 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX10-NEXT: s_lshr_b32 s1, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s5, 16 ; GFX10-NEXT: s_lshr_b32 s3, s3, s0 -; GFX10-NEXT: s_lshr_b32 s5, s6, 16 -; GFX10-NEXT: s_bfe_u32 s8, s6, 0x100000 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s1, s0 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: s_lshr_b32 s1, s4, s0 -; GFX10-NEXT: s_lshr_b32 s4, s2, s0 -; GFX10-NEXT: v_mov_b32_e32 v6, s3 -; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: s_lshr_b32 s2, s8, s0 -; GFX10-NEXT: v_mov_b32_e32 v8, s1 -; GFX10-NEXT: v_mov_b32_e32 v9, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: ds_write_b8 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_bfe_u32 s4, s5, 0x100000 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 -; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 -; GFX10-NEXT: ds_write_b8 v1, v6 offset:1 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 -; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 -; GFX10-NEXT: ds_write_b8 v1, v5 offset:6 -; GFX10-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-NEXT: v_mov_b32_e32 v10, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: s_lshr_b32 s8, s1, s0 +; GFX10-NEXT: s_lshr_b32 s1, s4, s0 +; GFX10-NEXT: s_lshr_b32 s4, s2, s0 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:2 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: ds_write_b8 v1, v0 offset:6 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: s_bfe_u32 s5, s6, 0x100000 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:1 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: s_lshr_b32 s1, s5, s0 -; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 -; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 -; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 -; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX10-NEXT: s_lshr_b32 s2, s6, 16 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:3 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: ds_write_b8 v1, v0 offset:5 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:7 ; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_lshr_b32 s1, s2, s0 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 +; GFX10-NEXT: ds_write_b8 v1, v0 offset:9 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:10 +; GFX10-NEXT: v_mov_b32_e32 v0, s7 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: s_bfe_u32 s1, s7, 0x100000 ; GFX10-NEXT: s_lshr_b32 s2, s7, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, s0 -; GFX10-NEXT: v_mov_b32_e32 v2, s7 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: ds_write_b8 v1, v0 offset:12 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: s_lshr_b32 s0, s2, s0 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: ds_write_b8 v1, v0 offset:11 -; GFX10-NEXT: ds_write_b8 v1, v2 offset:12 -; GFX10-NEXT: ds_write_b8 v1, v3 offset:13 -; GFX10-NEXT: ds_write_b8 v1, v4 offset:14 -; GFX10-NEXT: ds_write_b8 v1, v5 offset:15 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:11 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: ds_write_b8 v1, v0 offset:13 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:14 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:15 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align1: @@ -360,21 +360,21 @@ ; GFX10-NEXT: s_lshr_b32 s0, s4, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: s_lshr_b32 s1, s5, 16 -; GFX10-NEXT: s_lshr_b32 s2, s6, 16 -; GFX10-NEXT: s_lshr_b32 s3, s7, 16 -; GFX10-NEXT: v_mov_b32_e32 v4, s7 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: v_mov_b32_e32 v7, s2 -; GFX10-NEXT: v_mov_b32_e32 v8, s3 ; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s7 ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_lshr_b32 s2, s6, 16 +; GFX10-NEXT: s_lshr_b32 s3, s7, 16 ; GFX10-NEXT: ds_write_b16 v1, v3 offset:8 -; GFX10-NEXT: ds_write_b16 v1, v4 offset:12 -; GFX10-NEXT: ds_write_b16 v1, v5 offset:2 -; GFX10-NEXT: ds_write_b16 v1, v6 offset:6 -; GFX10-NEXT: ds_write_b16 v1, v7 offset:10 -; GFX10-NEXT: ds_write_b16 v1, v8 offset:14 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: ds_write_b16 v1, v0 offset:12 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: ds_write_b16 v1, v2 offset:2 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: ds_write_b16 v1, v3 offset:6 +; GFX10-NEXT: ds_write_b16 v1, v0 offset:10 +; GFX10-NEXT: ds_write_b16 v1, v2 offset:14 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align2: @@ -445,9 +445,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: v_mov_b32_e32 v4, s7 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX10-NEXT: ds_write2_b32 v1, v3, v4 offset0:2 offset1:3 +; GFX10-NEXT: v_mov_b32_e32 v0, s7 +; GFX10-NEXT: ds_write2_b32 v1, v3, v0 offset0:2 offset1:3 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align4: @@ -504,9 +504,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: v_mov_b32_e32 v4, s7 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX10-NEXT: ds_write2_b32 v1, v3, v4 offset0:2 offset1:3 +; GFX10-NEXT: v_mov_b32_e32 v0, s7 +; GFX10-NEXT: ds_write2_b32 v1, v3, v0 offset0:2 offset1:3 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align8: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -156,42 +156,42 @@ ; GFX10-NEXT: s_bfe_u32 s0, 8, 0x100000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-NEXT: s_bfe_u32 s3, s4, 0x100000 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, 16 -; GFX10-NEXT: s_bfe_u32 s4, s5, 0x100000 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s5, s6, 16 -; GFX10-NEXT: s_bfe_u32 s7, s6, 0x100000 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s1, s0 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: s_lshr_b32 s1, s4, s0 -; GFX10-NEXT: s_lshr_b32 s4, s2, s0 +; GFX10-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: s_lshr_b32 s2, s5, 16 ; GFX10-NEXT: s_lshr_b32 s3, s3, s0 -; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: s_lshr_b32 s2, s7, s0 -; GFX10-NEXT: v_mov_b32_e32 v9, s4 -; GFX10-NEXT: v_mov_b32_e32 v6, s3 -; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s1 +; GFX10-NEXT: s_lshr_b32 s7, s1, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 -; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 -; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 -; GFX10-NEXT: ds_write_b8 v1, v5 offset:6 -; GFX10-NEXT: ds_write_b8 v1, v6 offset:1 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 -; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:2 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: s_lshr_b32 s1, s4, s0 +; GFX10-NEXT: s_lshr_b32 s4, s2, s0 +; GFX10-NEXT: ds_write_b8 v1, v0 offset:6 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:1 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_bfe_u32 s1, s6, 0x100000 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:3 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: s_lshr_b32 s5, s6, 16 +; GFX10-NEXT: s_lshr_b32 s1, s1, s0 +; GFX10-NEXT: ds_write_b8 v1, v0 offset:5 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:7 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: s_lshr_b32 s0, s5, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:10 -; GFX10-NEXT: ds_write_b8 v1, v4 offset:11 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:11 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align1: @@ -293,20 +293,20 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_lshr_b32 s0, s4, 16 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s1, s5, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-NEXT: s_lshr_b32 s1, s5, 16 ; GFX10-NEXT: s_lshr_b32 s2, s6, 16 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v5, s1 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: ds_write_b16 v1, v3 offset:8 -; GFX10-NEXT: ds_write_b16 v1, v4 offset:2 -; GFX10-NEXT: ds_write_b16 v1, v5 offset:6 -; GFX10-NEXT: ds_write_b16 v1, v6 offset:10 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: ds_write_b16 v1, v0 offset:2 +; GFX10-NEXT: ds_write_b16 v1, v2 offset:6 +; GFX10-NEXT: ds_write_b16 v1, v3 offset:10 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align2: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -502,28 +502,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 ; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100 -; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_perm_b32 v3, v3, v1, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v4, v1, 16 ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v2, v2, v3 clamp +; GFX10-NEXT: v_mov_b32_e32 v3, 8 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp -; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_pk_lshrrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 24 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uaddsat_v4i8: @@ -1379,19 +1379,33 @@ ; GFX9-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_uaddsat_v5i32: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s5 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v1, s1, s6 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v2, s2, s7 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v3, s3, s8 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v4, s4, s9 clamp -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_uaddsat_v5i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s5 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s6 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s7 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v3, s3, s8 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s4, s9 clamp +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_uaddsat_v5i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_nc_u32_e64 v0, s0, s5 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v1, s1, s6 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v2, s2, s7 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v3, s3, s8 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v4, s4, s9 clamp +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: ; return to shader part epilog %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) ret <5 x i32> %result } @@ -1502,8 +1516,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v19 clamp @@ -1519,15 +1533,15 @@ ; GFX10-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v16 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uaddsat_v16i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp +; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp ; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp ; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v19 clamp @@ -1543,7 +1557,7 @@ ; GFX11-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp ; GFX11-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v15, v15, v16 clamp ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -1706,41 +1720,77 @@ ; GFX9-NEXT: v_readfirstlane_b32 s15, v15 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_uaddsat_v16i32: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s16 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v1, s1, s17 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v2, s2, s18 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v3, s3, s19 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v4, s4, s20 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v5, s5, s21 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v6, s6, s22 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v7, s7, s23 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v8, s8, s24 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v9, s9, s25 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v10, s10, s26 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v11, s11, s27 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v12, s12, s28 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v13, s13, s29 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v14, s14, s30 clamp -; GFX10PLUS-NEXT: v_add_nc_u32_e64 v15, s15, s31 clamp -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s8, v8 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s9, v9 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s10, v10 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s11, v11 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s12, v12 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s13, v13 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s14, v14 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s15, v15 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_uaddsat_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s16 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s17 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s18 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v3, s3, s19 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s4, s20 clamp +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_add_nc_u32_e64 v1, s5, s21 clamp +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_add_nc_u32_e64 v2, s6, s22 clamp +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_add_nc_u32_e64 v3, s7, s23 clamp +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s8, s24 clamp +; GFX10-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-NEXT: v_add_nc_u32_e64 v1, s9, s25 clamp +; GFX10-NEXT: v_readfirstlane_b32 s6, v2 +; GFX10-NEXT: v_add_nc_u32_e64 v2, s10, s26 clamp +; GFX10-NEXT: v_readfirstlane_b32 s7, v3 +; GFX10-NEXT: v_add_nc_u32_e64 v3, s11, s27 clamp +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s12, s28 clamp +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_add_nc_u32_e64 v1, s13, s29 clamp +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_add_nc_u32_e64 v2, s14, s30 clamp +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_add_nc_u32_e64 v3, s15, s31 clamp +; GFX10-NEXT: v_readfirstlane_b32 s12, v0 +; GFX10-NEXT: v_readfirstlane_b32 s13, v1 +; GFX10-NEXT: v_readfirstlane_b32 s14, v2 +; GFX10-NEXT: v_readfirstlane_b32 s15, v3 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_uaddsat_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_nc_u32_e64 v0, s0, s16 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v1, s1, s17 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v2, s2, s18 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v3, s3, s19 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v4, s4, s20 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v5, s5, s21 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v6, s6, s22 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v7, s7, s23 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v8, s8, s24 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v9, s9, s25 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v10, s10, s26 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v11, s11, s27 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v12, s12, s28 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v13, s13, s29 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v14, s14, s30 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v15, s15, s31 clamp +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: v_readfirstlane_b32 s8, v8 +; GFX11-NEXT: v_readfirstlane_b32 s9, v9 +; GFX11-NEXT: v_readfirstlane_b32 s10, v10 +; GFX11-NEXT: v_readfirstlane_b32 s11, v11 +; GFX11-NEXT: v_readfirstlane_b32 s12, v12 +; GFX11-NEXT: v_readfirstlane_b32 s13, v13 +; GFX11-NEXT: v_readfirstlane_b32 s14, v14 +; GFX11-NEXT: v_readfirstlane_b32 s15, v15 +; GFX11-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -403,120 +403,120 @@ ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX10-NEXT: v_mul_lo_u32 v4, s0, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v3, 0 ; GFX10-NEXT: v_mul_lo_u32 v5, s1, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, v2, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v3, 0 ; GFX10-NEXT: v_add3_u32 v1, v1, v4, v5 ; GFX10-NEXT: v_mul_lo_u32 v4, v2, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0 ; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX10-NEXT: v_mul_lo_u32 v7, v2, v1 -; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, v2, v1 ; GFX10-NEXT: v_add_co_u32 v4, s2, v4, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 -; GFX10-NEXT: v_add_co_u32 v6, s2, v7, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s2 -; GFX10-NEXT: v_add_co_u32 v0, s2, v4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX10-NEXT: v_add_co_u32 v4, s2, v6, v8 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 +; GFX10-NEXT: v_mul_hi_u32 v0, v2, v0 +; GFX10-NEXT: v_add_co_u32 v4, s2, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v5, v2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s2 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 +; GFX10-NEXT: v_add_co_u32 v0, s2, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6 -; GFX10-NEXT: v_add_co_u32 v0, s2, v4, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, v2, v1 +; GFX10-NEXT: v_add_co_u32 v0, s2, v0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, s2, v0, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v6, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s2 ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v0 ; GFX10-NEXT: v_add3_u32 v1, v5, v4, v1 ; GFX10-NEXT: v_mul_lo_u32 v4, s1, v3 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v3, 0 ; GFX10-NEXT: v_mul_lo_u32 v5, s0, v2 -; GFX10-NEXT: v_mul_hi_u32 v6, v2, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v3, 0 ; GFX10-NEXT: v_add3_u32 v1, v1, v5, v4 ; GFX10-NEXT: v_mul_lo_u32 v4, v2, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0 ; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX10-NEXT: v_mul_lo_u32 v7, v2, v1 -; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, v2, v1 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v7, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v6, v8 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6 -; GFX10-NEXT: v_add_co_u32 v0, s0, v4, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, v2, v0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v5, v2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 +; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v1, v2, v1 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v6, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0 ; GFX10-NEXT: v_add3_u32 v1, v5, v4, v1 -; GFX10-NEXT: v_mul_hi_u32 v4, s9, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo ; GFX10-NEXT: v_mul_lo_u32 v2, s9, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX10-NEXT: v_mul_lo_u32 v5, s9, v1 -; GFX10-NEXT: v_mul_hi_u32 v6, s8, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v5, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v4, v6 +; GFX10-NEXT: v_mul_hi_u32 v3, s8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v3 +; GFX10-NEXT: v_mul_lo_u32 v3, s9, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v0 +; GFX10-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: v_mul_lo_u32 v4, s11, v2 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, 1 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s10, v2, 0 ; GFX10-NEXT: v_mul_lo_u32 v5, s10, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add3_u32 v1, v1, v5, v4 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v6, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v8, s9, v1 -; GFX10-NEXT: v_sub_co_u32 v9, vcc_lo, s8, v0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s0, s9, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v8, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v9, s10 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v10 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s11, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v14, v13, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v8, s10 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v12 -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v13, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v7, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v4, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v1, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v9, v6, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v5, s1 -; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v7, v[2:3], s[6:7] +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, s8, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s9, v1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s10, v4 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s9, v1 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v0, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s11, v1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, v4, s10 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s10, v7 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v2, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s0, 0, v3, s0 +; GFX10-NEXT: v_add_co_u32 v11, s0, v1, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s0, 0, v10, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v12, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v1, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, s11, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v10, s0 +; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, v7, s10 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v6, s0 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] ; GFX10-NEXT: s_endpgm %div = udiv i64 %x, %y store i64 %div, ptr addrspace(1) %out0 @@ -634,12 +634,10 @@ ; GFX10-LABEL: udivrem_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX10-NEXT: s_sub_i32 s0, 0, s10 -; GFX10-NEXT: s_sub_i32 s1, 0, s11 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -647,39 +645,41 @@ ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s1, v1 +; GFX10-NEXT: s_sub_i32 s0, 0, s11 +; GFX10-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, v0, s10 ; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s8, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s11 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s8, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s9, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s10, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s11, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[6:7] +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s10, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s11, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i32> %x, %y store <2 x i32> %div, ptr addrspace(1) %out0 @@ -885,95 +885,96 @@ ; GFX10-LABEL: udivrem_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s14 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s15 ; GFX10-NEXT: s_sub_i32 s0, 0, s12 +; GFX10-NEXT: s_sub_i32 s1, 0, s14 +; GFX10-NEXT: s_sub_i32 s2, 0, s15 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX10-NEXT: s_sub_i32 s1, 0, s13 -; GFX10-NEXT: s_sub_i32 s2, 0, s14 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0 +; GFX10-NEXT: s_sub_i32 s0, 0, s13 +; GFX10-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 +; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, v0, s12 +; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s8, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s13 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s9, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s13, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v3, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s13, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, s1, v2 +; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-NEXT: v_mul_hi_u32 v2, s10, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, v2, s14 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s10, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v3, v6, s1 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v3, s1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s15 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v5, s1, v1 -; GFX10-NEXT: v_mul_lo_u32 v6, s2, v2 -; GFX10-NEXT: s_sub_i32 s0, 0, s15 -; GFX10-NEXT: v_mul_lo_u32 v7, s0, v3 -; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 -; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 +; GFX10-NEXT: v_mul_lo_u32 v7, s2, v3 ; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v6 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v7 -; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX10-NEXT: v_mul_hi_u32 v2, s10, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, v0, s12 -; GFX10-NEXT: v_mul_lo_u32 v5, v1, s13 -; GFX10-NEXT: v_mul_lo_u32 v6, v2, s14 -; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v7, v3, s15 -; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, s8, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, s9, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s10, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v3 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, s11, v7 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v5 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v6 ; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s12, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s13, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s14, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v12, s15, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v12, s2 -; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v5 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v6 -; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v8, s2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s15, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v8, s2 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v3 ; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s12, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s13, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s14, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v13, s15, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s12, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s13, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s14, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s15, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s2 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm %div = udiv <4 x i32> %x, %y store <4 x i32> %div, ptr addrspace(1) %out0 @@ -1519,261 +1520,261 @@ ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s13 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s14 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s12 ; GFX10-NEXT: s_sub_u32 s0, 0, s12 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: s_subb_u32 s1, 0, s13 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX10-NEXT: v_trunc_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0 +; GFX10-NEXT: v_mul_lo_u32 v4, s0, v2 +; GFX10-NEXT: v_mul_lo_u32 v5, s1, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v3, 0 +; GFX10-NEXT: v_add3_u32 v1, v1, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v4, v2, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX10-NEXT: v_add_co_u32 v4, s2, v4, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 +; GFX10-NEXT: v_mul_hi_u32 v0, v2, v0 +; GFX10-NEXT: v_add_co_u32 v4, s2, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v5, v2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s2 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 +; GFX10-NEXT: v_add_co_u32 v0, s2, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, s2, v0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v4, s2, v0, v4 +; GFX10-NEXT: v_mul_hi_u32 v0, v2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v6, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX10-NEXT: s_sub_u32 s2, 0, s14 ; GFX10-NEXT: s_subb_u32 s3, 0, s15 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, v0 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s15 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GFX10-NEXT: v_mul_lo_u32 v4, s1, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s0, v2 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GFX10-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 -; GFX10-NEXT: v_trunc_f32_e32 v2, v2 -; GFX10-NEXT: v_trunc_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v2 -; GFX10-NEXT: v_mul_f32_e32 v5, 0xcf800000, v3 -; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GFX10-NEXT: v_add_f32_e32 v0, v4, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX10-NEXT: v_mul_lo_u32 v10, s2, v6 -; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v0 -; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1 -; GFX10-NEXT: v_mul_lo_u32 v7, s0, v4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s0, v5, 0 -; GFX10-NEXT: v_mul_lo_u32 v9, s1, v5 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s6, s2, v8, 0 -; GFX10-NEXT: v_mul_lo_u32 v11, s3, v8 -; GFX10-NEXT: v_add3_u32 v1, v1, v7, v9 -; GFX10-NEXT: v_mul_lo_u32 v7, v4, v0 -; GFX10-NEXT: v_mul_hi_u32 v9, v5, v0 -; GFX10-NEXT: v_add3_u32 v3, v3, v10, v11 -; GFX10-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX10-NEXT: v_mul_lo_u32 v12, v5, v1 -; GFX10-NEXT: v_mul_lo_u32 v13, v4, v1 -; GFX10-NEXT: v_mul_lo_u32 v10, v6, v2 -; GFX10-NEXT: v_mul_lo_u32 v15, v8, v3 -; GFX10-NEXT: v_mul_hi_u32 v11, v8, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, v6, v2 -; GFX10-NEXT: v_mul_lo_u32 v16, v6, v3 -; GFX10-NEXT: v_mul_hi_u32 v14, v5, v1 -; GFX10-NEXT: v_add_co_u32 v7, s6, v7, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v0, s6, v13, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v10, s6, v10, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v2, s6, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v7, s6, v7, v9 -; GFX10-NEXT: v_mul_hi_u32 v17, v8, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v0, s6, v0, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v10, s6, v10, v11 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX10-NEXT: v_trunc_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_f32_e32 v6, 0xcf800000, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v1 +; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 +; GFX10-NEXT: v_mul_lo_u32 v8, s2, v6 +; GFX10-NEXT: v_mul_lo_u32 v9, s3, v7 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s2, v7, 0 +; GFX10-NEXT: v_add3_u32 v1, v1, v8, v9 +; GFX10-NEXT: v_mul_lo_u32 v8, v6, v0 +; GFX10-NEXT: v_mul_lo_u32 v9, v7, v1 +; GFX10-NEXT: v_add_co_u32 v8, s6, v8, v9 +; GFX10-NEXT: v_mul_hi_u32 v9, v7, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s6 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v12, v7 -; GFX10-NEXT: v_add_co_u32 v2, s6, v2, v17 -; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v15, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v0, s6, v0, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v9, v13, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s6 -; GFX10-NEXT: v_mul_hi_u32 v3, v6, v3 -; GFX10-NEXT: v_add_co_u32 v2, s6, v2, v10 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v16, v11 +; GFX10-NEXT: v_mul_hi_u32 v0, v6, v0 +; GFX10-NEXT: v_add_co_u32 v8, s6, v8, v9 +; GFX10-NEXT: v_mul_lo_u32 v9, v6, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s6 +; GFX10-NEXT: v_add_nc_u32_e32 v8, v10, v8 +; GFX10-NEXT: v_add_co_u32 v0, s6, v9, v0 +; GFX10-NEXT: v_mul_hi_u32 v9, v7, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s6 -; GFX10-NEXT: v_add3_u32 v1, v9, v7, v1 -; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v5, v0 -; GFX10-NEXT: v_add3_u32 v3, v11, v10, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v3, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s0, v5, 0 -; GFX10-NEXT: v_mul_lo_u32 v7, s1, v5 -; GFX10-NEXT: v_mul_lo_u32 v9, s0, v4 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v8, 0 -; GFX10-NEXT: v_mul_lo_u32 v10, s3, v8 -; GFX10-NEXT: v_mul_lo_u32 v11, s2, v6 -; GFX10-NEXT: v_mul_lo_u32 v12, v4, v0 -; GFX10-NEXT: v_mul_hi_u32 v13, v5, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX10-NEXT: v_add3_u32 v1, v1, v9, v7 -; GFX10-NEXT: v_mul_lo_u32 v7, v6, v2 -; GFX10-NEXT: v_mul_hi_u32 v9, v8, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, v6, v2 -; GFX10-NEXT: v_add3_u32 v3, v3, v11, v10 -; GFX10-NEXT: v_mul_lo_u32 v10, v5, v1 -; GFX10-NEXT: v_mul_lo_u32 v11, v4, v1 -; GFX10-NEXT: v_mul_hi_u32 v14, v5, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX10-NEXT: v_mul_lo_u32 v15, v8, v3 -; GFX10-NEXT: v_mul_lo_u32 v16, v6, v3 -; GFX10-NEXT: v_mul_hi_u32 v17, v8, v3 -; GFX10-NEXT: v_mul_hi_u32 v3, v6, v3 -; GFX10-NEXT: v_add_co_u32 v10, s0, v12, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v11, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v7, s0, v7, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v10, s0, v10, v13 +; GFX10-NEXT: v_add_co_u32 v0, s6, v0, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v8, s6, v0, v8 +; GFX10-NEXT: v_mul_hi_u32 v0, v6, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v9, v10, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v7, v8 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_add3_u32 v9, v9, v1, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v3, 0 +; GFX10-NEXT: v_mul_lo_u32 v8, s3, v7 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v9, vcc_lo +; GFX10-NEXT: v_add3_u32 v1, v1, v5, v4 +; GFX10-NEXT: v_mul_lo_u32 v4, v2, v0 +; GFX10-NEXT: v_mul_lo_u32 v9, s2, v6 +; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v0, v2, v0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v5, v2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v10, v4 +; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v7, s0, v7, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v9, v12, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v17 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v0, v4 +; GFX10-NEXT: v_mul_hi_u32 v0, v2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v10, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v7, 0 +; GFX10-NEXT: v_mul_lo_u32 v4, s9, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo +; GFX10-NEXT: v_add3_u32 v1, v1, v9, v8 +; GFX10-NEXT: v_mul_lo_u32 v8, v6, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s8, v2 +; GFX10-NEXT: v_mul_lo_u32 v9, v7, v1 +; GFX10-NEXT: v_add_co_u32 v8, s0, v8, v9 +; GFX10-NEXT: v_mul_hi_u32 v9, v7, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v0, v6, v0 +; GFX10-NEXT: v_add_co_u32 v8, s0, v8, v9 +; GFX10-NEXT: v_mul_lo_u32 v9, v6, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v8, v10, v8 +; GFX10-NEXT: v_add_co_u32 v0, s0, v9, v0 +; GFX10-NEXT: v_mul_hi_u32 v9, v7, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v1, v6, v1 ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v11, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v15, v7 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v16, v10 -; GFX10-NEXT: v_add3_u32 v1, v11, v9, v1 -; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_mul_hi_u32 v5, s8, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v4, s9, v0 -; GFX10-NEXT: v_add3_u32 v3, v10, v7, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX10-NEXT: v_mul_lo_u32 v7, s8, v1 -; GFX10-NEXT: v_mul_lo_u32 v10, s9, v1 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v6, v3, vcc_lo -; GFX10-NEXT: v_mul_hi_u32 v6, s8, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v10, v0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, s8, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v9, v10, v9 +; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v7, v0 +; GFX10-NEXT: v_add3_u32 v1, v9, v8, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v5, s9, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v6, v1, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v4, v8, v4 +; GFX10-NEXT: v_add_co_u32 v3, s0, v5, v3 +; GFX10-NEXT: v_mul_hi_u32 v5, s8, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v2, s9, v2 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_mul_lo_u32 v6, s11, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v7, v4 -; GFX10-NEXT: v_mul_lo_u32 v7, s10, v3 -; GFX10-NEXT: v_mul_lo_u32 v10, s11, v3 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 -; GFX10-NEXT: v_mul_hi_u32 v8, s10, v2 -; GFX10-NEXT: v_add_co_u32 v4, s0, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_mul_hi_u32 v2, s11, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, s10, v3 -; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s12, v4, 0 -; GFX10-NEXT: v_mul_lo_u32 v12, s13, v4 -; GFX10-NEXT: v_mul_lo_u32 v13, s12, v5 -; GFX10-NEXT: v_add_co_u32 v2, s0, v10, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_mul_lo_u32 v7, s13, v3 +; GFX10-NEXT: v_add3_u32 v2, v5, v4, v2 +; GFX10-NEXT: v_mul_lo_u32 v4, s11, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s10, v1 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_hi_u32 v5, s10, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add3_u32 v1, v1, v13, v12 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v7, v6 -; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v8, s9, v1 -; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, s8, v0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s0, s9, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s13, v8, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v10 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v10, s12 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s0, 0, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v11 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s13, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v16, s0, v4, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v5, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v15, v14, s0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v2, v6 +; GFX10-NEXT: v_mul_hi_u32 v0, s11, v0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v5 +; GFX10-NEXT: v_mul_lo_u32 v5, s11, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 +; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, s10, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v0, v4 +; GFX10-NEXT: v_mul_hi_u32 v0, s11, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v6, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v15, s0, v16, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s0, 0, v17, s0 -; GFX10-NEXT: v_add3_u32 v3, v7, v1, v3 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s14, v6, 0 -; GFX10-NEXT: v_mul_lo_u32 v19, s15, v6 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX10-NEXT: v_mul_lo_u32 v7, s14, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v16, s0, v8, s12 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v20, s0, 0, v0, s0 -; GFX10-NEXT: v_add3_u32 v2, v2, v7, v19 -; GFX10-NEXT: v_sub_co_u32 v7, s0, s10, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v13 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v13, s1, s11, v2, s0 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s11, v2 -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v15, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v17, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v16, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v20, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v13 +; GFX10-NEXT: v_mul_lo_u32 v6, s12, v2 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s12, v3, 0 +; GFX10-NEXT: v_add3_u32 v1, v1, v6, v7 +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, s8, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v7, s0, s9, v1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s9, v1 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, -1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0 -; GFX10-NEXT: v_sub_co_u32 v14, s0, v7, s14 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s2, 0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s1 -; GFX10-NEXT: v_add_co_u32 v16, s1, v6, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s1, 0, v3, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1 -; GFX10-NEXT: v_add_co_u32 v10, s1, v16, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s1, 0, v17, s1 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 -; GFX10-NEXT: v_sub_co_u32 v8, s1, v14, s14 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v16, v10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v17, v18, s0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v14, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v15, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v10, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v16, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, v8, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, v14, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s13, v1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v9, vcc_lo, v6, s12 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v8, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v9 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v3, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s0, 0, v2, s0 +; GFX10-NEXT: v_add_co_u32 v13, s0, v1, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v12, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v14, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; GFX10-NEXT: v_mul_lo_u32 v13, s14, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, v12, s0 +; GFX10-NEXT: v_mul_lo_u32 v12, s15, v4 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s1, s14, v4, 0 +; GFX10-NEXT: v_add3_u32 v3, v3, v13, v12 +; GFX10-NEXT: v_sub_co_u32 v12, s1, s10, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v13, s2, s11, v3, s1 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s14, v12 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s11, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s3, s15, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s15, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v14, s2 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s1, s15, v3, s1 +; GFX10-NEXT: v_sub_co_u32 v15, s1, v12, s14 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s2, 0, v14, s1 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s14, v15 +; GFX10-NEXT: v_cmp_le_u32_e64 s3, s15, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s15, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s3 +; GFX10-NEXT: v_cmp_ne_u32_e64 s3, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v17, s2 +; GFX10-NEXT: v_add_co_u32 v17, s2, v4, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s2, 0, v5, s2 +; GFX10-NEXT: v_add_co_u32 v19, s2, v17, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v20, s2, 0, v18, s2 +; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v17, v19, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v3, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v18, v20, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v3, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v9, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s13, v8, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v1, vcc_lo, v9, s12 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v2, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, vcc_lo, s15, v14, s1 +; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, v15, s14 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v15, v3, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v12, v2, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v13, v3, s3 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 @@ -2033,7 +2034,6 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -2045,44 +2045,45 @@ ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, v0, s2 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: s_movk_i32 s1, 0xff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_short v1, v0, s[4:5] -; GFX10-NEXT: global_store_short v1, v2, s[6:7] +; GFX10-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-NEXT: global_store_short v1, v2, s[2:3] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i8> %x, %y store <2 x i8> %div, ptr addrspace(1) %out0 @@ -2338,7 +2339,6 @@ ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2346,46 +2346,47 @@ ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_sub_i32 s3, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX10-NEXT: s_and_b32 s3, s0, 0xffff ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, v0, s2 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: global_store_dword v1, v2, s[6:7] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v2, s[2:3] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i16> %x, %y store <2 x i16> %div, ptr addrspace(1) %out0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -490,28 +490,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 ; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100 -; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_perm_b32 v3, v3, v1, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v4, v1, 16 ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v2, v2, v3 clamp +; GFX10-NEXT: v_mov_b32_e32 v3, 8 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp -; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_pk_lshrrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 24 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_usubsat_v4i8: @@ -1329,19 +1329,33 @@ ; GFX9-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_usubsat_v5i32: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s5 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, s1, s6 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, s2, s7 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, s3, s8 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v4, s4, s9 clamp -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_usubsat_v5i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s5 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s6 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s7 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, s3, s8 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s4, s9 clamp +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_usubsat_v5i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_sub_nc_u32_e64 v0, s0, s5 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v1, s1, s6 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v2, s2, s7 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v3, s3, s8 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v4, s4, s9 clamp +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: ; return to shader part epilog %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) ret <5 x i32> %result } @@ -1436,8 +1450,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp @@ -1453,15 +1467,15 @@ ; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v16 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_usubsat_v16i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp +; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp ; GFX11-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp ; GFX11-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp @@ -1477,7 +1491,7 @@ ; GFX11-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp ; GFX11-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v15, v15, v16 clamp ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -1624,41 +1638,77 @@ ; GFX9-NEXT: v_readfirstlane_b32 s15, v15 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_usubsat_v16i32: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s16 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, s1, s17 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, s2, s18 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, s3, s19 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v4, s4, s20 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v5, s5, s21 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v6, s6, s22 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v7, s7, s23 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v8, s8, s24 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v9, s9, s25 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v10, s10, s26 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v11, s11, s27 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v12, s12, s28 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v13, s13, s29 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v14, s14, s30 clamp -; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v15, s15, s31 clamp -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s8, v8 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s9, v9 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s10, v10 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s11, v11 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s12, v12 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s13, v13 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s14, v14 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s15, v15 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_usubsat_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s16 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s17 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s18 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, s3, s19 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s4, s20 clamp +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s5, s21 clamp +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s6, s22 clamp +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_sub_nc_u32_e64 v3, s7, s23 clamp +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s8, s24 clamp +; GFX10-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s9, s25 clamp +; GFX10-NEXT: v_readfirstlane_b32 s6, v2 +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s10, s26 clamp +; GFX10-NEXT: v_readfirstlane_b32 s7, v3 +; GFX10-NEXT: v_sub_nc_u32_e64 v3, s11, s27 clamp +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s12, s28 clamp +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s13, s29 clamp +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s14, s30 clamp +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_sub_nc_u32_e64 v3, s15, s31 clamp +; GFX10-NEXT: v_readfirstlane_b32 s12, v0 +; GFX10-NEXT: v_readfirstlane_b32 s13, v1 +; GFX10-NEXT: v_readfirstlane_b32 s14, v2 +; GFX10-NEXT: v_readfirstlane_b32 s15, v3 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_usubsat_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_sub_nc_u32_e64 v0, s0, s16 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v1, s1, s17 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v2, s2, s18 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v3, s3, s19 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v4, s4, s20 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v5, s5, s21 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v6, s6, s22 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v7, s7, s23 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v8, s8, s24 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v9, s9, s25 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v10, s10, s26 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v11, s11, s27 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v12, s12, s28 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v13, s13, s29 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v14, s14, s30 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v15, s15, s31 clamp +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: v_readfirstlane_b32 s8, v8 +; GFX11-NEXT: v_readfirstlane_b32 s9, v9 +; GFX11-NEXT: v_readfirstlane_b32 s10, v10 +; GFX11-NEXT: v_readfirstlane_b32 s11, v11 +; GFX11-NEXT: v_readfirstlane_b32 s12, v12 +; GFX11-NEXT: v_readfirstlane_b32 s13, v13 +; GFX11-NEXT: v_readfirstlane_b32 s14, v14 +; GFX11-NEXT: v_readfirstlane_b32 s15, v15 +; GFX11-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result } Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2958,15 +2958,14 @@ ; GFX1032-NEXT: .LBB10_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s2, v2, 0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, 0 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s3, v2, v[4:5] -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 -; GFX1032-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s @@ -3144,16 +3144,15 @@ ; GFX1032-NEXT: .LBB12_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, 0 +; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s4, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 -; GFX1032-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/bf16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/bf16.ll +++ llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3141,10 +3141,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 @@ -3173,13 +3169,16 @@ ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:124 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:120 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:124 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:120 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:116 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:116 ; GFX10-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll +++ llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll @@ -19,29 +19,29 @@ ; ISA-NEXT: s_cselect_b32 s6, s5, 0 ; ISA-NEXT: s_lshr_b32 s7, 1, s4 ; ISA-NEXT: s_cmp_lg_u32 s4, 0 -; ISA-NEXT: v_cvt_f32_i32_e32 v0, s6 +; ISA-NEXT: v_cvt_f32_i32_e32 v4, s6 ; ISA-NEXT: s_cselect_b32 s8, -1, 0 ; ISA-NEXT: s_and_b32 s8, s8, exec_lo ; ISA-NEXT: s_cselect_b32 s7, s7, 0 ; ISA-NEXT: s_lshr_b32 s5, s5, 1 ; ISA-NEXT: s_cmp_lg_u32 s4, 0 -; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s7 +; ISA-NEXT: v_cvt_f32_ubyte0_e32 v5, s7 ; ISA-NEXT: s_cselect_b32 s4, -1, 0 ; ISA-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4 ; ISA-NEXT: s_and_b32 s4, s4, exec_lo ; ISA-NEXT: s_cselect_b32 s4, s5, 0 -; ISA-NEXT: v_cvt_f32_i32_e32 v5, s4 +; ISA-NEXT: v_cvt_f32_i32_e32 v0, s4 ; ISA-NEXT: s_mov_b32 s4, 0 -; ISA-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; ISA-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; ISA-NEXT: .LBB0_1: ; %bb14 ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 ; ISA-NEXT: v_mov_b32_e32 v6, v7 ; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo ; ISA-NEXT: s_or_b32 s4, s5, s4 ; ISA-NEXT: v_add_f32_e32 v7, v6, v3 +; ISA-NEXT: v_add_f32_e32 v7, v7, v0 ; ISA-NEXT: v_add_f32_e32 v7, v7, v5 ; ISA-NEXT: v_add_f32_e32 v7, v7, v4 -; ISA-NEXT: v_add_f32_e32 v7, v7, v0 ; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; ISA-NEXT: s_cbranch_execnz .LBB0_1 ; ISA-NEXT: ; %bb.2: ; %bb21 Index: llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -525,12 +525,10 @@ ; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10_DEFAULT-NEXT: s_clause 0x1 -; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6 -; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4 -; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1) -; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0 +; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4 +; GFX10_DEFAULT-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:6 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) -; GFX10_DEFAULT-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX10_DEFAULT-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) ; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] Index: llvm/test/CodeGen/AMDGPU/cluster_stores.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -51,47 +51,46 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s4, s0, 8 -; GFX10-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-NEXT: s_add_u32 s6, s0, 16 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_addc_u32 s7, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_add_u32 s4, s0, 8 +; GFX10-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-NEXT: flat_load_dword v2, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_add_u32 s4, s0, 16 +; GFX10-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-NEXT: s_add_u32 s0, s0, 24 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-NEXT: v_mov_b32_e32 v5, s7 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 -; GFX10-NEXT: v_mov_b32_e32 v6, s0 -; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: flat_load_dword v8, v[0:1] -; GFX10-NEXT: flat_load_dword v9, v[2:3] -; GFX10-NEXT: flat_load_dword v10, v[4:5] -; GFX10-NEXT: flat_load_dword v11, v[6:7] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_add_u32 s0, s2, 8 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 16 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_add_u32 s2, s2, 24 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s1 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; GFX10-NEXT: flat_store_dword v[0:1], v8 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_add_u32 s0, s2, 16 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) -; GFX10-NEXT: flat_store_dword v[2:3], v9 +; GFX10-NEXT: flat_store_dword v[0:1], v3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_add_u32 s0, s2, 24 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) -; GFX10-NEXT: flat_store_dword v[4:5], v10 +; GFX10-NEXT: flat_store_dword v[0:1], v4 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) -; GFX10-NEXT: flat_store_dword v[6:7], v11 +; GFX10-NEXT: flat_store_dword v[0:1], v5 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: cluster_load_cluster_store: @@ -178,48 +177,47 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_add_u32 s4, s0, 8 ; GFX10-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: s_add_u32 s6, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_addc_u32 s7, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_load_dword v2, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: s_add_u32 s0, s0, 24 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-NEXT: v_mov_b32_e32 v5, s7 -; GFX10-NEXT: flat_load_dword v6, v[2:3] -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: flat_load_dword v8, v[0:1] -; GFX10-NEXT: flat_load_dword v9, v[4:5] -; GFX10-NEXT: flat_load_dword v10, v[2:3] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_add_u32 s0, s2, 8 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: s_add_u32 s4, s2, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: s_addc_u32 s5, s3, 0 +; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 24 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 -; GFX10-NEXT: v_mov_b32_e32 v6, s0 -; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; GFX10-NEXT: flat_store_dword v[0:1], v8 -; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) -; GFX10-NEXT: flat_store_dword v[4:5], v9 -; GFX10-NEXT: flat_store_dword v[2:3], v11 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; GFX10-NEXT: flat_store_dword v[0:1], v3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_add_u32 s0, s2, 24 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v4 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) -; GFX10-NEXT: flat_store_dword v[6:7], v10 +; GFX10-NEXT: flat_store_dword v[0:1], v5 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: cluster_load_valu_cluster_store: @@ -421,28 +419,27 @@ ; ; GFX10-LABEL: cluster_image_sample: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_cvt_f32_i32_e32 v8, v0 -; GFX10-NEXT: v_cvt_f32_i32_e32 v9, v1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v12, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v10, 1.0 -; GFX10-NEXT: v_add_f32_e32 v2, 1.0, v8 -; GFX10-NEXT: v_add_f32_e32 v3, 1.0, v9 +; GFX10-NEXT: v_cvt_f32_i32_e32 v13, v1 +; GFX10-NEXT: v_add_f32_e32 v2, 1.0, v12 ; GFX10-NEXT: v_mov_b32_e32 v5, v4 +; GFX10-NEXT: v_add_f32_e32 v3, 1.0, v13 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v7, v4 -; GFX10-NEXT: v_add_f32_e32 v8, 2.0, v8 -; GFX10-NEXT: v_add_f32_e32 v9, 2.0, v9 -; GFX10-NEXT: v_mov_b32_e32 v11, v10 -; GFX10-NEXT: v_mov_b32_e32 v12, v10 -; GFX10-NEXT: v_mov_b32_e32 v13, v10 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: image_sample_d v[14:17], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: image_sample_d v[18:21], v[8:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_sample_d v[8:11], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX10-NEXT: v_add_f32_e32 v2, 2.0, v12 +; GFX10-NEXT: v_add_f32_e32 v3, 2.0, v13 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_mov_b32_e32 v7, v4 +; GFX10-NEXT: image_sample_d v[2:5], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v5, v17, v21 -; GFX10-NEXT: v_add_f32_e32 v4, v16, v20 -; GFX10-NEXT: v_add_f32_e32 v3, v15, v19 -; GFX10-NEXT: v_add_f32_e32 v2, v14, v18 +; GFX10-NEXT: v_add_f32_e32 v5, v11, v5 +; GFX10-NEXT: v_add_f32_e32 v4, v10, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v9, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v8, v2 ; GFX10-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; GFX10-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1358,22 +1358,23 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 ; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned: @@ -1513,31 +1514,31 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 24 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v4, v0, s[2:3] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 -; GFX10-NEXT: v_add_nc_u16 v4, v0, 9 -; GFX10-NEXT: v_add_nc_u16 v2, v2, 9 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, 24 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX10-NEXT: v_add_nc_u16 v2, v4, 9 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u16 v1, v1, 9 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff00, v4 +; GFX10-NEXT: v_add_nc_u16 v0, v0, 0x900 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_add_nc_u16 v1, v1, 0x900 -; GFX10-NEXT: v_add_nc_u16 v5, v2, 0x900 -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX10-NEXT: global_store_dword v4, v5, s[2:3] +; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v5, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses: @@ -1710,31 +1711,30 @@ ; GFX10-LABEL: load_v7i8_to_v7f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x5 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 -; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 -; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: global_load_ubyte v5, v4, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v0, v4, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v1, v4, s[2:3] offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 +; GFX10-NEXT: global_load_ubyte v0, v4, s[2:3] offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 +; GFX10-NEXT: global_load_ubyte v0, v4, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: global_load_short_d16 v0, v4, s[2:3] offset:4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] offset:16 ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v7i8_to_v7f32: @@ -1861,20 +1861,20 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 -; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v9 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 -; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v8 -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 -; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v5 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v5 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v5 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:16 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 +; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v8i8_to_v8f32: @@ -2236,22 +2236,23 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 ; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32: Index: llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll +++ llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll @@ -5,64 +5,77 @@ ; GCN-LABEL: _amdgpu_ps_main: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D -; GCN-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm -; GCN-NEXT: s_clause 0x3 -; GCN-NEXT: s_buffer_load_dword s24, s[0:3], 0x5c -; GCN-NEXT: s_buffer_load_dword s28, s[0:3], 0x7c -; GCN-NEXT: s_buffer_load_dword s29, s[0:3], 0xc0 +; GCN-NEXT: s_clause 0x2 +; GCN-NEXT: s_buffer_load_dword s8, s[0:3], 0x5c +; GCN-NEXT: s_buffer_load_dword s9, s[0:3], 0x7c ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x40 +; GCN-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x60 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x20 +; GCN-NEXT: v_sub_f32_e64 v2, s8, s9 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, s2, v0 +; GCN-NEXT: v_fma_f32 v1, v1, v2, s9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_fma_f32 v0, -v0, s2, s6 +; GCN-NEXT: v_max_f32_e64 v2, s0, s0 clamp +; GCN-NEXT: v_fmac_f32_e32 v3, v0, v2 +; GCN-NEXT: image_sample v0, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D +; GCN-NEXT: s_waitcnt_depctr 0xffe3 +; GCN-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x40 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mad_f32 v5, s2, v2, v0 ; GCN-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x50 -; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_fma_f32 v4, -s2, v2, s6 ; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x2c -; GCN-NEXT: v_sub_f32_e64 v5, s24, s28 +; GCN-NEXT: v_fmac_f32_e32 v5, v4, v2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_clause 0x4 -; GCN-NEXT: s_buffer_load_dwordx4 s[8:11], s[0:3], 0x60 -; GCN-NEXT: s_buffer_load_dwordx4 s[12:15], s[0:3], 0x20 -; GCN-NEXT: s_buffer_load_dwordx4 s[16:19], s[0:3], 0x0 -; GCN-NEXT: s_buffer_load_dwordx4 s[20:23], s[0:3], 0x70 -; GCN-NEXT: s_buffer_load_dwordx4 s[24:27], s[0:3], 0x10 -; GCN-NEXT: v_fma_f32 v1, v1, v5, s28 -; GCN-NEXT: v_max_f32_e64 v6, s0, s0 clamp -; GCN-NEXT: v_add_f32_e64 v5, s29, -1.0 -; GCN-NEXT: v_sub_f32_e32 v8, s0, v1 -; GCN-NEXT: v_fma_f32 v7, -s2, v6, s6 -; GCN-NEXT: v_fma_f32 v5, v6, v5, 1.0 -; GCN-NEXT: v_mad_f32 v10, s2, v6, v2 -; GCN-NEXT: s_mov_b32 s0, 0x3c23d70a -; GCN-NEXT: v_fmac_f32_e32 v1, v6, v8 -; GCN-NEXT: v_fmac_f32_e32 v10, v7, v6 +; GCN-NEXT: v_sub_f32_e32 v4, s0, v1 +; GCN-NEXT: v_fmac_f32_e32 v1, v2, v4 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm +; GCN-NEXT: s_waitcnt_depctr 0xffe3 +; GCN-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x70 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v4, v4, v5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v9, s10, v0 -; GCN-NEXT: v_fma_f32 v0, -v0, s10, s14 -; GCN-NEXT: v_mul_f32_e32 v8, s18, v2 -; GCN-NEXT: v_mul_f32_e32 v3, s22, v3 -; GCN-NEXT: v_fmac_f32_e32 v9, v0, v6 -; GCN-NEXT: v_sub_f32_e32 v0, v1, v5 -; GCN-NEXT: v_mul_f32_e32 v1, v8, v6 -; GCN-NEXT: v_mul_f32_e32 v7, v6, v3 -; GCN-NEXT: v_fma_f32 v3, -v6, v3, v9 -; GCN-NEXT: v_fmac_f32_e32 v5, v0, v6 -; GCN-NEXT: v_fma_f32 v0, v2, s26, -v1 -; GCN-NEXT: v_fmac_f32_e32 v7, v3, v6 -; GCN-NEXT: v_fmac_f32_e32 v1, v0, v6 -; GCN-NEXT: v_mul_f32_e32 v0, v2, v6 +; GCN-NEXT: image_sample v5, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v4, v4, v10 -; GCN-NEXT: v_mul_f32_e32 v3, v4, v6 -; GCN-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a -; GCN-NEXT: v_mul_f32_e32 v1, v3, v1 -; GCN-NEXT: v_mul_f32_e32 v2, v7, v4 +; GCN-NEXT: v_mul_f32_e32 v5, s2, v5 +; GCN-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x0 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GCN-NEXT: v_fma_f32 v3, -v2, v5, v3 +; GCN-NEXT: v_mul_f32_e32 v5, v2, v5 +; GCN-NEXT: v_fmac_f32_e32 v5, v3, v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x10 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0xc0 +; GCN-NEXT: v_mul_f32_e32 v3, s2, v0 +; GCN-NEXT: v_mul_f32_e32 v3, v3, v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_fma_f32 v6, v0, s6, -v3 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-NEXT: v_fmac_f32_e32 v3, v6, v2 +; GCN-NEXT: v_add_f32_e64 v6, s0, -1.0 +; GCN-NEXT: s_mov_b32 s0, 0x3c23d70a +; GCN-NEXT: v_fma_f32 v6, v2, v6, 1.0 +; GCN-NEXT: v_sub_f32_e32 v1, v1, v6 +; GCN-NEXT: v_fmac_f32_e32 v6, v1, v2 +; GCN-NEXT: v_mul_f32_e32 v1, v4, v2 +; GCN-NEXT: v_fmaak_f32 v2, s0, v6, 0x3ca3d70a +; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-NEXT: v_mul_f32_e32 v2, v5, v2 ; GCN-NEXT: v_fmac_f32_e32 v1, v2, v0 ; GCN-NEXT: v_max_f32_e32 v0, 0, v1 ; GCN-NEXT: ; return to shader part epilog Index: llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -92,15 +92,15 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX10-NEXT: ds_write_b32 v0, v1 offset:12 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: ds_write_b32 v2, v3 offset:12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0 -; GFX10-NEXT: global_store_dword v[0:1], v4, off +; GFX10-NEXT: v_div_fmas_f32 v2, s0, s0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; @@ -173,15 +173,15 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: ds_write_b32 v0, v1 offset:65532 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: ds_write_b32 v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0 -; GFX10-NEXT: global_store_dword v[0:1], v4, off +; GFX10-NEXT: v_div_fmas_f32 v2, s0, s0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; @@ -565,17 +565,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0x7b -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX10-NEXT: ds_write_b32 v0, v1 offset:1023 +; GFX10-NEXT: ds_write_b32 v0, v2 offset:1019 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: ds_write_b32 v2, v3 offset:1023 -; GFX10-NEXT: ds_write_b32 v2, v4 offset:1019 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_fmas_f32 v5, s0, s0, s0 -; GFX10-NEXT: global_store_dword v[0:1], v5, off +; GFX10-NEXT: v_div_fmas_f32 v2, s0, s0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -319,8 +319,7 @@ ; FUNC-LABEL: {{^}}fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] -; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] - +; PREGFX10-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] ; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] ; PREGFX10-NOT: s_setreg ; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 @@ -332,9 +331,10 @@ ; PREGFX10-NOT: s_setreg ; GFX10-NOT: s_denorm_mode -; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 -; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]] -; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; GFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]], {{[^,]+}}, 1.0 +; GFX10-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] +; GFX10-DAG: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; GFX10-DAG: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]] ; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] ; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] ; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]] Index: llvm/test/CodeGen/AMDGPU/fshl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fshl.ll +++ llvm/test/CodeGen/AMDGPU/fshl.ll @@ -537,21 +537,21 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1 -; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1 -; GFX10-NEXT: v_alignbit_b32 v5, s5, s9, 1 -; GFX10-NEXT: v_alignbit_b32 v6, s4, s8, 1 ; GFX10-NEXT: s_lshr_b32 s2, s7, 1 ; GFX10-NEXT: s_not_b32 s3, s15 +; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1 ; GFX10-NEXT: s_lshr_b32 s6, s6, 1 -; GFX10-NEXT: s_not_b32 s7, s14 -; GFX10-NEXT: s_lshr_b32 s5, s5, 1 -; GFX10-NEXT: s_not_b32 s9, s13 -; GFX10-NEXT: s_lshr_b32 s4, s4, 1 -; GFX10-NEXT: s_not_b32 s8, s12 ; GFX10-NEXT: v_alignbit_b32 v3, s2, v0, s3 +; GFX10-NEXT: v_alignbit_b32 v0, s5, s9, 1 +; GFX10-NEXT: s_not_b32 s7, s14 +; GFX10-NEXT: s_lshr_b32 s2, s5, 1 +; GFX10-NEXT: s_not_b32 s3, s13 ; GFX10-NEXT: v_alignbit_b32 v2, s6, v1, s7 -; GFX10-NEXT: v_alignbit_b32 v1, s5, v5, s9 -; GFX10-NEXT: v_alignbit_b32 v0, s4, v6, s8 +; GFX10-NEXT: v_alignbit_b32 v1, s2, v0, s3 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1 +; GFX10-NEXT: s_lshr_b32 s2, s4, 1 +; GFX10-NEXT: s_not_b32 s3, s12 +; GFX10-NEXT: v_alignbit_b32 v0, s2, v0, s3 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/fshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fshr.ll +++ llvm/test/CodeGen/AMDGPU/fshr.ll @@ -448,21 +448,21 @@ ; ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s15 ; GFX10-NEXT: v_mov_b32_e32 v1, s14 -; GFX10-NEXT: v_mov_b32_e32 v4, s13 -; GFX10-NEXT: v_mov_b32_e32 v5, s12 ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s13 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, v1 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, v4 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] +; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, v0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32: @@ -946,24 +946,24 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v4 -; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7 +; GFX10-NEXT: v_lshlrev_b16 v6, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8 -; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 +; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 +; GFX10-NEXT: v_lshlrev_b16 v6, v7, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_lshrrev_b16 v7, v4, v7 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX10-NEXT: v_lshlrev_b16 v0, v4, v0 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5 -; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX10-NEXT: v_lshlrev_b16 v1, v4, v1 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1097,31 +1097,31 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 ; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6 ; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 -; GFX10-NEXT: v_lshlrev_b16 v9, 1, v9 -; GFX10-NEXT: v_xor_b32_e32 v12, -1, v10 -; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v13, -1, v5 +; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v14, -1, v4 +; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 ; GFX10-NEXT: v_lshlrev_b16 v7, v7, v8 -; GFX10-NEXT: v_lshrrev_b16 v8, v10, v11 -; GFX10-NEXT: v_lshlrev_b16 v9, v12, v9 -; GFX10-NEXT: v_lshlrev_b16 v1, v13, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v14, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 -; GFX10-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_lshrrev_b16 v8, v9, v8 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v9 +; GFX10-NEXT: v_lshlrev_b16 v9, v9, v10 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 ; GFX10-NEXT: v_or_b32_e32 v5, v9, v8 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_lshlrev_b16 v1, v10, v1 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v4 +; GFX10-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 ; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_v4i16: @@ -1207,9 +1207,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_not_b32_e32 v5, v4 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX10-NEXT: v_not_b32_e32 v4, v4 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1293,17 +1293,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_not_b32_e32 v9, v8 -; GFX10-NEXT: v_not_b32_e32 v11, v10 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_not_b32_e32 v8, v8 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v10, v[6:7] +; GFX10-NEXT: v_not_b32_e32 v6, v10 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_v2i64: @@ -1478,19 +1478,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 -; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 ; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6 -; GFX10-NEXT: v_mul_u32_u24_e32 v7, 24, v7 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 8, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 8, v5 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v5 +; GFX10-NEXT: v_mul_hi_u32 v2, 0xaaaaaab, v2 +; GFX10-NEXT: v_mul_u32_u24_e32 v2, 24, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v5, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_v2i24: Index: llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -5537,15 +5537,15 @@ ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[36:37] +; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_v8i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s37, s37, external_void_func_v8i32@rel32@hi+12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[34:35] offset:16 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[34:35] -; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32@rel32@hi+12 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: v_readlane_b32 s34, v41, 0 @@ -5615,15 +5615,15 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_getpc_b64 s[2:3] +; GFX10-SCRATCH-NEXT: s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4 +; GFX10-SCRATCH-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] -; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 -; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 @@ -5856,6 +5856,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[36:37] +; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_v16i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s37, s37, external_void_func_v16i32@rel32@hi+12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[34:35] @@ -5863,10 +5866,7 @@ ; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[34:35] offset:32 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[34:35] offset:48 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[34:35] -; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v16i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v16i32@rel32@hi+12 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: v_readlane_b32 s34, v41, 0 @@ -5938,6 +5938,9 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_getpc_b64 s[2:3] +; GFX10-SCRATCH-NEXT: s_add_u32 s2, s2, external_void_func_v16i32@rel32@lo+4 +; GFX10-SCRATCH-NEXT: s_addc_u32 s3, s3, external_void_func_v16i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] @@ -5945,10 +5948,7 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] -; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4 -; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 @@ -6026,6 +6026,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[36:37] +; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_v32i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s37, s37, external_void_func_v32i32@rel32@hi+12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[34:35] @@ -6037,10 +6040,7 @@ ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[34:35] -; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32@rel32@hi+12 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: v_readlane_b32 s34, v41, 0 @@ -6071,6 +6071,9 @@ ; GFX11-NEXT: v_mov_b32_e32 v28, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v32i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v32i32@rel32@hi+12 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x7 ; GFX11-NEXT: global_load_b128 v[0:3], v28, s[0:1] @@ -6082,11 +6085,8 @@ ; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96 ; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -6116,6 +6116,9 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_getpc_b64 s[2:3] +; GFX10-SCRATCH-NEXT: s_add_u32 s2, s2, external_void_func_v32i32@rel32@lo+4 +; GFX10-SCRATCH-NEXT: s_addc_u32 s3, s3, external_void_func_v32i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x7 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] @@ -6127,10 +6130,7 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] -; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 -; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 @@ -7360,19 +7360,19 @@ ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:20 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16 ; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_writelane_b32 v40, s35, 3 @@ -7455,11 +7455,9 @@ ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:24 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 offset:16 -; GFX11-NEXT: scratch_load_b32 v31, off, s33 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_load_b64 v[31:32], off, s33 offset:16 ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 @@ -7494,8 +7492,9 @@ ; GFX11-NEXT: v_writelane_b32 v40, s61, 29 ; GFX11-NEXT: v_writelane_b32 v40, s62, 30 ; GFX11-NEXT: v_writelane_b32 v40, s63, 31 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[31:32], s32 +; GFX11-NEXT: scratch_load_b32 v31, off, s33 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s63, v40, 31 ; GFX11-NEXT: v_readlane_b32 s62, v40, 30 @@ -7547,11 +7546,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:24 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:16 -; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[31:32], off, s33 offset:16 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 @@ -7586,8 +7583,9 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s61, 29 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s62, 30 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s63, 31 -; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 +; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[31:32], s32 +; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s63, v40, 31 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s62, v40, 30 @@ -13763,24 +13761,24 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-NEXT: v_mov_b32_e32 v3, s49 -; GFX10-NEXT: s_mov_b32 s20, s36 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-NEXT: v_mov_b32_e32 v0, s50 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: v_mov_b32_e32 v1, s51 +; GFX10-NEXT: s_mov_b32 s20, s36 +; GFX10-NEXT: v_writelane_b32 v40, s25, 21 ; GFX10-NEXT: s_mov_b32 s21, s37 ; GFX10-NEXT: s_mov_b32 s22, s38 ; GFX10-NEXT: s_mov_b32 s23, s39 ; GFX10-NEXT: s_mov_b32 s24, s40 -; GFX10-NEXT: v_writelane_b32 v40, s25, 21 +; GFX10-NEXT: v_writelane_b32 v40, s26, 22 ; GFX10-NEXT: s_mov_b32 s25, s41 -; GFX10-NEXT: v_mov_b32_e32 v4, s50 -; GFX10-NEXT: v_mov_b32_e32 v5, s51 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: s_mov_b32 s26, s42 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 -; GFX10-NEXT: v_writelane_b32 v40, s26, 22 -; GFX10-NEXT: s_mov_b32 s26, s42 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; GFX10-NEXT: v_writelane_b32 v40, s27, 23 ; GFX10-NEXT: s_mov_b32 s27, s43 ; GFX10-NEXT: v_writelane_b32 v40, s28, 24 @@ -13978,25 +13976,25 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s50 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s51 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 ; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:16 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 -; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 +; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 ; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 ; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 ; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 ; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23 ; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24 @@ -14210,30 +14208,30 @@ ; GFX10-NEXT: v_writelane_b32 v40, s22, 18 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s52 -; GFX10-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-NEXT: v_mov_b32_e32 v1, s46 ; GFX10-NEXT: v_writelane_b32 v40, s23, 19 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 -; GFX10-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-NEXT: v_mov_b32_e32 v0, s47 ; GFX10-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-NEXT: v_mov_b32_e32 v3, s49 ; GFX10-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX10-NEXT: v_mov_b32_e32 v1, s50 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX10-NEXT: v_mov_b32_e32 v0, s51 +; GFX10-NEXT: v_writelane_b32 v40, s25, 21 ; GFX10-NEXT: s_mov_b32 s20, s36 ; GFX10-NEXT: s_mov_b32 s21, s37 ; GFX10-NEXT: s_mov_b32 s22, s38 ; GFX10-NEXT: s_mov_b32 s23, s39 -; GFX10-NEXT: v_writelane_b32 v40, s25, 21 +; GFX10-NEXT: v_writelane_b32 v40, s26, 22 ; GFX10-NEXT: s_mov_b32 s24, s40 ; GFX10-NEXT: s_mov_b32 s25, s41 -; GFX10-NEXT: v_mov_b32_e32 v4, s50 -; GFX10-NEXT: v_mov_b32_e32 v5, s51 -; GFX10-NEXT: v_writelane_b32 v40, s26, 22 ; GFX10-NEXT: s_mov_b32 s26, s42 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX10-NEXT: v_writelane_b32 v40, s27, 23 ; GFX10-NEXT: s_mov_b32 s27, s43 ; GFX10-NEXT: v_writelane_b32 v40, s28, 24 @@ -14437,27 +14435,27 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s50 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s51 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s32 offset:24 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 ; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 ; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 ; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 ; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 ; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 ; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s32 offset:24 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 ; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23 ; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24 @@ -14560,20 +14558,19 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, stack_passed_f64_arg@rel32@hi+12 -; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -15038,20 +15035,20 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 10 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_mov_b32_e32 v3, 14 +; GFX10-NEXT: v_mov_b32_e32 v3, 11 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; GFX10-NEXT: v_mov_b32_e32 v0, 11 -; GFX10-NEXT: v_mov_b32_e32 v1, 12 -; GFX10-NEXT: v_mov_b32_e32 v2, 13 -; GFX10-NEXT: v_mov_b32_e32 v4, 15 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX10-NEXT: v_mov_b32_e32 v0, 12 +; GFX10-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-NEXT: v_mov_b32_e32 v2, 14 +; GFX10-NEXT: v_mov_b32_e32 v3, 15 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 -; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:28 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -15175,24 +15172,24 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 8 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 9 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 10 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 11 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 8 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 9 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 10 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 11 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v9, 1 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v10, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v11, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v12, 2 @@ -15340,20 +15337,20 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x41300000 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x41300000 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x41400000 -; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000 -; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x41400000 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x41500000 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x41600000 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x41700000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 -; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:28 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -15483,24 +15480,24 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41500000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41600000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x41700000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0x41000000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x41100000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41200000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41000000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41100000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41200000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x41300000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v9, 1.0 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v10, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v11, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v12, 2.0 Index: llvm/test/CodeGen/AMDGPU/idiv-licm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -668,29 +668,29 @@ ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX10-NEXT: v_add_nc_u16 v3, s4, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[0:1], 1 -; GFX10-NEXT: s_add_u32 s6, s2, s4 -; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_add_nc_u16 v3, s1, 1 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s1 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[4:5], 1 +; GFX10-NEXT: s_add_u32 s6, s2, s6 +; GFX10-NEXT: v_readfirstlane_b32 s1, v3 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v1 -; GFX10-NEXT: s_addc_u32 s7, s3, s5 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX10-NEXT: s_addc_u32 s7, s3, s7 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_trunc_f32_e32 v3, v3 -; GFX10-NEXT: v_mad_f32 v4, -v3, v0, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, 0, v3, s0 +; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX10-NEXT: v_trunc_f32_e32 v4, v4 +; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v3|, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, 0, v4, s0 ; GFX10-NEXT: global_store_short v2, v3, s[6:7] ; GFX10-NEXT: s_cbranch_vccz .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 @@ -796,8 +796,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -806,15 +806,15 @@ ; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] ; GFX10-NEXT: v_mul_f32_e32 v8, v7, v3 -; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, s2, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s3, v6, vcc_lo ; GFX10-NEXT: v_trunc_f32_e32 v8, v8 ; GFX10-NEXT: v_mad_f32 v7, -v8, v2, v7 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX10-NEXT: v_mul_lo_u32 v7, v7, s1 +; GFX10-NEXT: v_mul_lo_u32 v7, v7, s0 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v7 ; GFX10-NEXT: global_store_short v[5:6], v0, off ; GFX10-NEXT: s_cbranch_vccz .LBB5_1 @@ -934,22 +934,22 @@ ; GFX10-NEXT: .LBB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_sext_i32_i16 s0, s5 -; GFX10-NEXT: v_add_nc_u16 v3, s5, 1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX10-NEXT: s_xor_b32 s0, s0, s4 ; GFX10-NEXT: s_ashr_i32 s0, s0, 30 -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 ; GFX10-NEXT: s_or_b32 s0, s0, 1 -; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v4|, |v0| -; GFX10-NEXT: v_cvt_i32_f32_e32 v4, v5 +; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX10-NEXT: v_trunc_f32_e32 v4, v4 +; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v3|, |v0| +; GFX10-NEXT: v_add_nc_u16 v3, s5, 1 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 ; GFX10-NEXT: s_cselect_b32 s6, s0, 0 ; GFX10-NEXT: s_and_b32 s0, s5, 0xffff ; GFX10-NEXT: v_readfirstlane_b32 s5, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v3, s6, v4 +; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, s6, v3 ; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; GFX10-NEXT: s_add_u32 s6, s2, s6 ; GFX10-NEXT: s_addc_u32 s7, s3, s7 @@ -1078,26 +1078,26 @@ ; GFX10-NEXT: .LBB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_sext_i32_i16 s8, s5 +; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s8 +; GFX10-NEXT: s_xor_b32 s6, s8, s4 +; GFX10-NEXT: s_ashr_i32 s6, s6, 30 +; GFX10-NEXT: s_or_b32 s6, s6, 1 +; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX10-NEXT: v_trunc_f32_e32 v4, v4 +; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v3|, |v0| ; GFX10-NEXT: v_add_nc_u16 v3, s5, 1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v4, s8 -; GFX10-NEXT: s_xor_b32 s0, s8, s4 -; GFX10-NEXT: s_ashr_i32 s0, s0, 30 +; GFX10-NEXT: s_and_b32 s0, s0, exec_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX10-NEXT: s_or_b32 s0, s0, 1 -; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v4|, |v0| -; GFX10-NEXT: v_cvt_i32_f32_e32 v4, v5 -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s6, s0, 0 +; GFX10-NEXT: s_cselect_b32 s6, s6, 0 ; GFX10-NEXT: s_and_b32 s0, s5, 0xffff -; GFX10-NEXT: v_add_nc_u32_e32 v4, s6, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v3 +; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, s6, v3 ; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; GFX10-NEXT: s_add_u32 s6, s2, s6 -; GFX10-NEXT: v_mul_lo_u32 v3, v4, s4 ; GFX10-NEXT: s_addc_u32 s7, s3, s7 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, s4 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3 ; GFX10-NEXT: global_store_short v2, v3, s[6:7] ; GFX10-NEXT: s_cbranch_vccz .LBB7_1 Index: llvm/test/CodeGen/AMDGPU/idot4s.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot4s.ll +++ llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -329,34 +329,34 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-DL-NEXT: v_bfe_i32 v0, v2, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX10-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v6, v6, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v5, v3, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 -; GFX10-DL-NEXT: v_bfe_i32 v4, v8, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v7, v9, 0, 8 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX10-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 -; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX10-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 -; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 +; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -532,18 +532,17 @@ ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] -; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v4, v2, v3, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -717,14 +716,14 @@ ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 8 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_mul_i32_i24_e32 v5, v0, v3 +; GFX10-DL-NEXT: v_mul_i32_i24_e32 v4, v0, v3 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s2 +; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_add3_u32 v0, v3, v0, v4 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -906,15 +905,15 @@ ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b16 v0, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16 v3, 8, v2 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_lshrrev_b16 v2, 8, v2 +; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 +; GFX10-DL-NEXT: v_add3_u32 v0, v3, s2, v0 +; GFX10-DL-NEXT: v_add3_u32 v0, v0, v4, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -1108,36 +1107,36 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_ashrrev_i16 v4, 8, v1 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_ashrrev_i16 v5, 8, v2 -; GFX10-DL-NEXT: v_bfe_i32 v6, v2, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v7, v1, 0, 8 +; GFX10-DL-NEXT: v_ashrrev_i16 v0, 8, v1 +; GFX10-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_bfe_i32 v4, v2, 0, 8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-DL-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX10-DL-NEXT: v_ashrrev_i16 v3, 8, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 -; GFX10-DL-NEXT: v_ashrrev_i16 v6, 8, v1 -; GFX10-DL-NEXT: v_ashrrev_i16 v7, 8, v2 -; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX10-DL-NEXT: v_perm_b32 v3, v3, v4, 0x5040100 +; GFX10-DL-NEXT: v_ashrrev_i16 v4, 8, v1 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_perm_b32 v2, v7, v2, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v1, v6, v1, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v0, v3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-DL-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX10-DL-NEXT: v_ashrrev_i16 v4, 8, v2 +; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX10-DL-NEXT: v_perm_b32 v2, v4, v2, 0x5040100 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v2, v1 +; GFX10-DL-NEXT: global_load_ushort v2, v3, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-NEXT: v_add_nc_u16 v2, v0, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v2, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2 +; GFX10-DL-NEXT: global_store_short v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { Index: llvm/test/CodeGen/AMDGPU/idot4u.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot4u.ll +++ llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -311,31 +311,31 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xff +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 -; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v4, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 +; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -512,18 +512,17 @@ ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] -; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v4, v2, v3, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -671,13 +670,11 @@ ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] -; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 -; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v2, v2, v3, v4 -; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v2 +; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v3 +; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -836,18 +833,17 @@ ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] -; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v4, v3, v2, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v4, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4 -; GFX10-DL-NEXT: v_mad_u16 v0, v7, v6, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v4, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -1205,14 +1201,14 @@ ; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v0, v3 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v0, v3 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2 +; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_add3_u32 v0, v3, v0, v4 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -1398,15 +1394,15 @@ ; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 8, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s2, v0 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v4, v3 +; GFX10-DL-NEXT: v_add3_u32 v2, v0, v3, v4 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 +; GFX10-DL-NEXT: v_add3_u32 v0, v2, v1, v0 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -1592,35 +1588,34 @@ ; ; GFX10-DL-LABEL: notdot4_mixedtypes: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v7, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ushort v5, v4, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v8, v2, 0, 8 -; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v3, v5 +; GFX10-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v5, v2, 0, 8 +; GFX10-DL-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xff +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_mad_u16 v3, v6, v8, v3 -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 -; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v3, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v1, v2, v0 +; GFX10-DL-NEXT: global_store_short v4, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1794,13 +1789,13 @@ ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v3, v0 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 +; GFX10-DL-NEXT: v_add3_u32 v0, v0, v5, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -1983,38 +1978,38 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b16 v4, 8, v1 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v0, v0, s[4:5] ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xff, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v1 -; GFX10-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_perm_b32 v2, v7, v2, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v1, v6, v1, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_perm_b32 v3, v3, v4, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-DL-NEXT: v_perm_b32 v2, v4, v2, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b16 v4, 8, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-DL-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b16 v4, 8, v0 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-DL-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-DL-NEXT: global_load_ushort v1, v3, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-NEXT: v_add_nc_u16 v1, v0, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v1, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-DL-NEXT: global_store_short v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2182,31 +2177,31 @@ ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX10-DL-NEXT: v_lshrrev_b16 v9, 8, v2 -; GFX10-DL-NEXT: v_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v1 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX10-DL-NEXT: v_lshrrev_b16 v4, 8, v2 +; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v3 +; GFX10-DL-NEXT: v_lshrrev_b16 v3, 8, v1 +; GFX10-DL-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-DL-NEXT: v_mul_lo_u16 v3, v3, v4 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-DL-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX10-DL-NEXT: global_load_ubyte v5, v4, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_mad_u16 v5, v1, v2, v5 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-DL-NEXT: v_mul_lo_u16 v6, v1, v2 +; GFX10-DL-NEXT: v_or_b32_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v3 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 -; GFX10-DL-NEXT: v_mul_lo_u16 v5, v7, v8 -; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9 -; GFX10-DL-NEXT: v_lshlrev_b16 v4, 8, v4 -; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4 -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5 -; GFX10-DL-NEXT: v_mad_u16 v1, v7, v8, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2 -; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-DL-NEXT: v_add_nc_u16 v0, v1, v0 +; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { Index: llvm/test/CodeGen/AMDGPU/idot8s.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8s.ll +++ llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -622,6 +622,7 @@ ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 @@ -632,66 +633,65 @@ ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX10-DL-XNACK-NEXT: global_load_ushort v5, v4, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v0, v3, v5 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 4, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1 -; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 20, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v1, v2, v0 +; GFX10-DL-XNACK-NEXT: global_store_short v4, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_endpgm ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc16: @@ -713,62 +713,62 @@ ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 4, v1 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v3, v6, v9, v3 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v3, v7, v8, v3 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 28, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v3, v5, v10, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 20, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 -; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v11, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v7, v5, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v9, v0, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v4 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v4, v0 ; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc16: @@ -1194,6 +1194,7 @@ ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 @@ -1204,66 +1205,65 @@ ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX10-DL-XNACK-NEXT: global_load_ubyte v5, v4, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v0, v3, v5 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 4, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1 -; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 20, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v1, v2, v0 +; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_endpgm ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc8: @@ -1285,62 +1285,62 @@ ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 4, v1 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v3, v6, v9, v3 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v3, v7, v8, v3 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 28, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v3, v5, v10, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 20, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 -; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v11, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v7, v5, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v9, v0, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v4 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v5, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v4, v0 ; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc8: @@ -1696,38 +1696,38 @@ ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v0, v1, 0, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 4, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v0, v1, 4, 4 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v2, 4, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v5, v1, 8, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v2, 8, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v2, 0, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v8, v1, 12, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v2, 4, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v2, 8, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v5, v2, 0, 4 +; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v0, v0, v3 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 8, 4 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v4 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v9, v2, 12, 4 -; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v1, 0, 4 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s2 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 16, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v10, v2, 16, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v11, v1, 20, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v12, v2, 20, 4 -; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v0, v0, v7, v5 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v1, 24, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v13, v2, 24, 4 -; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9 -; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v6, v4, v5, s2 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v4, v4, v5, v6 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v5, v2, 16, 4 +; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v4, v0, v3 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 12, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v2, 12, 4 +; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v4 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v1, 16, 4 +; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v4, v5 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v5, v2, 24, 4 ; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4 -; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12 -; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 20, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v2, 20, 4 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v2, 28, v2 -; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v8, v6 +; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v4 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v1, 24, 4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1 +; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v4, v5 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v1, v1, v2 -; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v5 +; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4 +; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v6 ; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_endpgm ; @@ -1749,37 +1749,37 @@ ; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v2, v1, 0, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v1, 4, 4 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v4, v0, 4, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v5, v1, 8, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v0, 8, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v0, 0, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v0, 0, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v4, v1, 4, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v5, v0, 4, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 8, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v0, 8, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v8, v1, 12, 4 -; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v9, v0, 12, 4 -; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v4, v5 +; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v1, 16, 4 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v5, v6, v7 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s2 -; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 16, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 16, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v1, 20, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v12, v0, 20, 4 -; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, v2, v7, v5 -; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v1, 24, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v13, v0, 24, 4 -; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9 -; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10 -; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v3, v4 -; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12 -; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v6, v2, v3, s2 +; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v0, 16, 4 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v7, v8, v9 +; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v9, v1, 20, 4 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, v2, v3, v6 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v8, v10, v11 +; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 20, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v1, 24, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v0, 24, 4 +; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v4, v5 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v9, v10 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v8, v6 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v11 +; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v7, v8 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v0, v1, v0 -; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v3, v4 +; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v4, v3 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5 +; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v6 ; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_multiuses_mul1: @@ -2514,84 +2514,84 @@ ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 +; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-DL-XNACK-NEXT: global_load_dword v0, v0, s[4:5] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 28, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 20, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 12, v1 +; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v0 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v0 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v0 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v11, v12, v11, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 ; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 4, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v1, v6, v1, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 4, v0 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v6, v6, v7, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v6, v1 +; GFX10-DL-XNACK-NEXT: global_load_ushort v6, v5, s[0:1] +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v8 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v13 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v14 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v8, v12, v8, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v3, v7, v3 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 24, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v16 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v6, v8 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v17 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v4, v11, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v12 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v2, v3, v5 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v2, v1 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v6, v1, v6 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v6, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v4 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v4 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3 -; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v1, v0 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v2 +; GFX10-DL-XNACK-NEXT: global_store_short v5, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_endpgm ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul: @@ -2608,79 +2608,79 @@ ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[4:5] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 12, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v11, v12, v11, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v13 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v8, v12, v8, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v8, v9, v8, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v0 ; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v5, v11, v5, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v8, v4 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 24, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v15 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v6, v8 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v17 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v4, v11, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v5, v8, v7, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v7, v9, v10, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v3, v5 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v7, v5 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v1, v1, v4, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v0, v0, v11, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v5 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v3, v4 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3 @@ -3154,7 +3154,6 @@ ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 @@ -3165,88 +3164,89 @@ ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1] -; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v1 +; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 20, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 28, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v0, v0, v3 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 4, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v3 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v3, v3, v4 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v0, 20, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v8, v8, v15 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v17 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v4, v4, v5 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 8, v4 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v5, v5, v6 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 8, v5 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v10, v15 -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v7 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 4, v1 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v0, v11 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v5, v12 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 8, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v7, v7, v8 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v8, v9 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: global_load_ubyte v10, v7, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v1, v3 -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v9, v3, v10 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v10, v0, v10 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v0 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v1 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v0, v11, v12 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v2, v10, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v5, v2, v5 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v9, v8 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v5, v12, v0 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v5, v2 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v8, v9, v0 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v7, v14, v0 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v11, v12, v0 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[0:1] +; GFX10-DL-XNACK-NEXT: global_store_byte v7, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_endpgm ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul: @@ -3254,7 +3254,6 @@ ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 @@ -3265,88 +3264,89 @@ ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] -; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1] -; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v1 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v2, 20, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 20, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v2, 12, v2 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v4 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v8, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v3 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v2, 12, v2 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v9, v16 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v2, v2, v10 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v4, v7, v4 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v11 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 8, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v5, v12 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v11, v7, v14 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v7, v8, v7 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v5, v10 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v9, v11 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13 +; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v8, v11, s[0:1] +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 8, v4 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 8, v9 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v0, v12, v4 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v2, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 8, v3 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v2, v1, v2 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v10 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v2, v2, v8 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v8, v2, v3 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v9, v8 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v8, v7 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v12, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v10, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v12, v4, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[0:1] +; GFX10-DL-NOXNACK-NEXT: global_store_byte v11, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry Index: llvm/test/CodeGen/AMDGPU/idot8u.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8u.ll +++ llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -497,27 +497,27 @@ ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 4, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 12, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 20, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] @@ -815,27 +815,27 @@ ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 4, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 12, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 20, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] @@ -1137,27 +1137,27 @@ ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 4, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 12, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 20, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 @@ -1444,27 +1444,27 @@ ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 4, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 12, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 20, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 @@ -1754,39 +1754,39 @@ ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v1 +; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 8, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1 -; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v3, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 +; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 4, 4 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX10-DL-NEXT: v_bfe_u32 v3, v1, 12, 4 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v3, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s2 -; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v11 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v12 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v10, v13 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 -; GFX10-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v15 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v14 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v7 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v4, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v10 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v8, v9 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v6, v5 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: v_mad_u32_u24 v8, v4, v5, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v6, v6, v7, v8 +; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 20, 4 +; GFX10-DL-NEXT: v_add3_u32 v0, v6, v0, v3 +; GFX10-DL-NEXT: v_bfe_u32 v3, v1, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v3, v6 +; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v7 +; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v6 +; GFX10-DL-NEXT: v_bfe_u32 v3, v1, 24, 4 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v3, v6 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v4, v5 +; GFX10-DL-NEXT: v_add3_u32 v0, v1, v8, v0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2334,54 +2334,54 @@ ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v0, v0, s[4:5] ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 +; GFX10-DL-NEXT: v_bfe_u32 v2, v1, 24, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1 +; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 20, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 12, 4 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_bfe_u32 v6, v0, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v0, 4, 4 +; GFX10-DL-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v3, v1, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v0, 20, 4 +; GFX10-DL-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 8, 4 +; GFX10-DL-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v5, v0, 8, 4 ; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX10-DL-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v0 +; GFX10-DL-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v7, v0, 16, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v6, v1 +; GFX10-DL-NEXT: global_load_ushort v6, v5, s[0:1] ; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v7, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 -; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v6, v1, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v1, v6, v1 +; GFX10-DL-NEXT: v_bfe_u32 v6, v0, 24, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX10-DL-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 +; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v1, v0 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2 +; GFX10-DL-NEXT: global_store_short v5, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2685,7 +2685,6 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 @@ -2696,58 +2695,59 @@ ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1] -; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 -; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v1, v2, 8, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v1, v7, v1 -; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v9 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v16, v2, 24, 4 -; GFX10-DL-NEXT: v_or_b32_e32 v6, v1, v6 -; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v15 -; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v14 -; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v0 -; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v13 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-DL-NEXT: v_mul_lo_u16 v1, v12, v7 -; GFX10-DL-NEXT: v_mul_lo_u16 v11, v10, v16 -; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2 -; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v8 -; GFX10-DL-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX10-DL-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v13 +; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 20, 4 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_bfe_u32 v3, v2, 20, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v2 +; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v2 +; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1 +; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 24, 4 +; GFX10-DL-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-DL-NEXT: v_mul_lo_u16 v3, v3, v4 +; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 8, 4 +; GFX10-DL-NEXT: v_lshlrev_b16 v12, 8, v3 +; GFX10-DL-NEXT: v_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 12, 4 +; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX10-DL-NEXT: v_lshlrev_b16 v5, 8, v5 +; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v7 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX10-DL-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v8 +; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 +; GFX10-DL-NEXT: v_lshlrev_b16 v5, 8, v7 +; GFX10-DL-NEXT: v_mul_lo_u16 v10, v8, v9 +; GFX10-DL-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-DL-NEXT: v_or_b32_e32 v6, v6, v5 +; GFX10-DL-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX10-DL-NEXT: global_load_ubyte v10, v7, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v9 +; GFX10-DL-NEXT: v_add_nc_u16 v6, v6, v10 +; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX10-DL-NEXT: v_mul_lo_u16 v1, v10, v11 +; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX10-DL-NEXT: v_or_b32_sdwa v2, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX10-DL-NEXT: v_add_nc_u16 v2, v6, v2 +; GFX10-DL-NEXT: v_add_nc_u16 v4, v2, v4 ; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v0, v5, v6 -; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-NEXT: v_mad_u16 v0, v12, v7, v0 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v4, v2 +; GFX10-DL-NEXT: v_mad_u16 v0, v8, v9, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX10-DL-NEXT: v_mad_u16 v0, v10, v16, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v12 +; GFX10-DL-NEXT: v_mad_u16 v0, v10, v11, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v7, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -3027,55 +3027,55 @@ ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v0, v0, s[4:5] ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 +; GFX10-DL-NEXT: v_bfe_u32 v2, v1, 24, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1 +; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 20, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 12, 4 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_bfe_u32 v6, v0, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v0, 4, 4 +; GFX10-DL-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v3, v1, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v0, 20, 4 +; GFX10-DL-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 8, 4 +; GFX10-DL-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v5, v0, 8, 4 ; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX10-DL-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v0 +; GFX10-DL-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v7, v0, 16, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v6, v1 +; GFX10-DL-NEXT: global_load_ubyte v6, v5, s[0:1] ; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v7, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 -; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v6, v1, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v1, v6, v1 +; GFX10-DL-NEXT: v_bfe_u32 v6, v0, 24, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX10-DL-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 +; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v1, v0 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX10-DL-NEXT: global_store_byte v5, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -3089,24 +3089,26 @@ ; GFX11-NEXT: s_cmp_eq_u32 s1, 7 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_cndmask_b32_e64 v9, v3, s0, s2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 5 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 5 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 2 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s0, s2 +; GFX11-NEXT: v_perm_b32 v2, v10, v2, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v11, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 @@ -3114,42 +3116,40 @@ ; GFX11-NEXT: s_cmp_eq_u32 s1, 14 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s0, s3 -; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v12, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 15 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 12 ; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; GFX11-NEXT: v_perm_b32 v2, v10, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v13, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v13, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 13 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 10 ; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v14, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 11 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 8 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v15, s0, s2 +; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v15, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 9 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s0, s2 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: v_perm_b32 v7, v10, v7, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s0, s1 -; GFX11-NEXT: v_perm_b32 v6, v12, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v14, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v12, v7, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, s0, s1 +; GFX11-NEXT: v_perm_b32 v6, v13, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v14, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v10, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v11, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] Index: llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll +++ llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll @@ -724,30 +724,29 @@ ; ; GFX10-LABEL: lds_ds_fmin_f64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10-NEXT: s_add_u32 s8, s8, s3 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_mov_b32 s3, 0x40450000 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-NEXT: s_mov_b32 s1, 0x40450000 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s5, s4, 3 -; GFX10-NEXT: s_lshl_b32 s0, s4, 4 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: s_lshl_b32 s2, s4, 3 +; GFX10-NEXT: s_lshl_b32 s3, s4, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 ; GFX10-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 ; GFX10-NEXT: ds_min_f64 v4, v[0:1] offset:64 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 ; GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen @@ -888,31 +887,30 @@ ; ; G_GFX10-LABEL: lds_ds_fmin_f64: ; G_GFX10: ; %bb.0: +; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; G_GFX10-NEXT: s_mov_b32 s10, -1 ; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000 ; G_GFX10-NEXT: s_add_u32 s8, s8, s3 -; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; G_GFX10-NEXT: s_mov_b32 s2, 0 ; G_GFX10-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX10-NEXT: s_mov_b32 s0, 0 -; G_GFX10-NEXT: s_mov_b32 s1, 0x40450000 -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: s_mov_b32 s3, 0x40450000 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: s_add_i32 s4, s4, 4 -; G_GFX10-NEXT: v_mov_b32_e32 v5, s3 -; G_GFX10-NEXT: s_lshl_b32 s5, s4, 3 -; G_GFX10-NEXT: s_lshl_b32 s0, s4, 4 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s5 -; G_GFX10-NEXT: v_mov_b32_e32 v4, s0 +; G_GFX10-NEXT: s_lshl_b32 s2, s4, 3 +; G_GFX10-NEXT: s_lshl_b32 s3, s4, 4 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v4, s3 ; G_GFX10-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] ; G_GFX10-NEXT: ds_min_f64 v4, v[0:1] +; G_GFX10-NEXT: v_mov_b32_e32 v0, s1 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX10-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3] -; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 +; G_GFX10-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; G_GFX10-NEXT: v_mov_b32_e32 v2, s0 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen ; G_GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 @@ -1064,30 +1062,29 @@ ; ; GFX10-LABEL: lds_ds_fmax_f64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10-NEXT: s_add_u32 s8, s8, s3 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_mov_b32 s3, 0x40450000 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-NEXT: s_mov_b32 s1, 0x40450000 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s5, s4, 3 -; GFX10-NEXT: s_lshl_b32 s0, s4, 4 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: s_lshl_b32 s2, s4, 3 +; GFX10-NEXT: s_lshl_b32 s3, s4, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 ; GFX10-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 ; GFX10-NEXT: ds_max_f64 v4, v[0:1] offset:64 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 ; GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen @@ -1228,31 +1225,30 @@ ; ; G_GFX10-LABEL: lds_ds_fmax_f64: ; G_GFX10: ; %bb.0: +; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; G_GFX10-NEXT: s_mov_b32 s10, -1 ; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000 ; G_GFX10-NEXT: s_add_u32 s8, s8, s3 -; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; G_GFX10-NEXT: s_mov_b32 s2, 0 ; G_GFX10-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX10-NEXT: s_mov_b32 s0, 0 -; G_GFX10-NEXT: s_mov_b32 s1, 0x40450000 -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: s_mov_b32 s3, 0x40450000 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: s_add_i32 s4, s4, 4 -; G_GFX10-NEXT: v_mov_b32_e32 v5, s3 -; G_GFX10-NEXT: s_lshl_b32 s5, s4, 3 -; G_GFX10-NEXT: s_lshl_b32 s0, s4, 4 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s5 -; G_GFX10-NEXT: v_mov_b32_e32 v4, s0 +; G_GFX10-NEXT: s_lshl_b32 s2, s4, 3 +; G_GFX10-NEXT: s_lshl_b32 s3, s4, 4 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v4, s3 ; G_GFX10-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] ; G_GFX10-NEXT: ds_max_f64 v4, v[0:1] +; G_GFX10-NEXT: v_mov_b32_e32 v0, s1 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX10-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3] -; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 +; G_GFX10-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; G_GFX10-NEXT: v_mov_b32_e32 v2, s0 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen ; G_GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -572,18 +572,6 @@ ; GFX9-NEXT: image_sample_d v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_d_3d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v12, v8 -; GFX10-NEXT: v_mov_b32_e32 v10, v5 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_perm_b32 v11, v7, v6, 0x5040100 -; GFX10-NEXT: v_perm_b32 v9, v4, v3, 0x5040100 -; GFX10-NEXT: v_perm_b32 v7, v1, v0, 0x5040100 -; GFX10-NEXT: image_sample_d_g16 v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -891,18 +879,6 @@ ; GFX9-NEXT: image_sample_c_d_o v0, v[8:13], s[0:7], s[8:11] dmask:0x4 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_c_d_o_2darray_V1: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v13, v8 -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_perm_b32 v12, v7, v6, 0x5040100 -; GFX10-NEXT: v_perm_b32 v11, v5, v4, 0x5040100 -; GFX10-NEXT: v_perm_b32 v10, v3, v2, 0x5040100 -; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[8:13], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret float %v @@ -921,18 +897,6 @@ ; GFX9-NEXT: image_sample_c_d_o v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_c_d_o_2darray_V2: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v13, v8 -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_perm_b32 v12, v7, v6, 0x5040100 -; GFX10-NEXT: v_perm_b32 v11, v5, v4, 0x5040100 -; GFX10-NEXT: v_perm_b32 v10, v3, v2, 0x5040100 -; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <2 x float> %v Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll @@ -43,15 +43,10 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, half %s, half %t, half %r) { ; GFX10-LABEL: sample_d_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v15, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, v5 -; GFX10-NEXT: v_mov_b32_e32 v12, v4 -; GFX10-NEXT: v_mov_b32_e32 v11, v3 -; GFX10-NEXT: v_mov_b32_e32 v10, v2 -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_perm_b32 v14, v7, v6, 0x5040100 -; GFX10-NEXT: image_sample_d v[0:3], v[8:15], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; @@ -126,13 +121,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v7, v1 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_perm_b32 v10, v5, v4, 0x5040100 -; GFX10-NEXT: image_sample_d_cl v[0:3], v[6:11], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX10-NEXT: image_sample_d_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; @@ -171,14 +163,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) { ; GFX10-LABEL: sample_c_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v13, v7 -; GFX10-NEXT: v_mov_b32_e32 v11, v4 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, v1 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: v_perm_b32 v12, v6, v5, 0x5040100 -; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[7:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_perm_b32 v5, v8, v5, 0x5040100 +; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; @@ -289,13 +277,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v7, v1 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_perm_b32 v10, v5, v4, 0x5040100 -; GFX10-NEXT: image_sample_cd_cl v[0:3], v[6:11], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; @@ -334,14 +319,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) { ; GFX10-LABEL: sample_c_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v13, v7 -; GFX10-NEXT: v_mov_b32_e32 v11, v4 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, v1 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: v_perm_b32 v12, v6, v5, 0x5040100 -; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[7:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_perm_b32 v5, v8, v5, 0x5040100 +; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; @@ -361,15 +342,10 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v15, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, v5 -; GFX10-NEXT: v_mov_b32_e32 v12, v4 -; GFX10-NEXT: v_mov_b32_e32 v11, v3 -; GFX10-NEXT: v_mov_b32_e32 v10, v2 -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_perm_b32 v14, v7, v6, 0x5040100 -; GFX10-NEXT: image_sample_c_d_o v0, v[8:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10-NEXT: image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; @@ -389,15 +365,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v15, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, v5 -; GFX10-NEXT: v_mov_b32_e32 v12, v4 -; GFX10-NEXT: v_mov_b32_e32 v11, v3 -; GFX10-NEXT: v_mov_b32_e32 v10, v2 -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_perm_b32 v14, v7, v6, 0x5040100 -; GFX10-NEXT: image_sample_c_d_o v[0:1], v[8:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10-NEXT: image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -122,8 +122,8 @@ ;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_and: ;CHECK-NEXT: %bb. ;GFX10-NEXT: s_clause -;CHECK-NEXT: buffer_load_{{dwordx4|b128}} v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK-NEXT: buffer_load_{{dwordx2|b64}} v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 +;CHECK-DAG: buffer_load_{{dwordx4|b128}} v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +;CHECK-DAG: buffer_load_{{dwordx2|b64}} v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 ;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) { main_body: @@ -148,8 +148,8 @@ ;CHECK-NEXT: %bb. ;CHECK-NEXT: v_lshlrev_b32_e32 v{{[0-9]}}, 6, v0 ;GFX10-NEXT: s_clause -;CHECK-NEXT: buffer_load_{{dwordx4|b128}} v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 -;CHECK-NEXT: buffer_load_{{dwordx2|b64}} v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 +;CHECK-DAG: buffer_load_{{dwordx4|b128}} v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 +;CHECK-DAG: buffer_load_{{dwordx2|b64}} v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 ;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) { main_body: @@ -271,9 +271,9 @@ } ;CHECK-LABEL: {{^}}buffer_load_int: -;CHECK: buffer_load_{{dwordx4|b128}} v[0:3], off, s[0:3], 0 -;CHECK: buffer_load_{{dwordx2|b64}} v[4:5], off, s[0:3], 0 glc -;CHECK: buffer_load_{{dword|b32}} v6, off, s[0:3], 0 slc +;CHECK-DAG: buffer_load_{{dwordx4|b128}} v[0:3], off, s[0:3], 0 +;CHECK-DAG: buffer_load_{{dwordx2|b64}} v[4:5], off, s[0:3], 0 glc +;CHECK-DAG: buffer_load_{{dword|b32}} v6, off, s[0:3], 0 slc ;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) { main_body: Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -18,8 +18,8 @@ ; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX10-NEXT: s_and_saveexec_b32 s0, s0 ; GFX10-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen -; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 Index: llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll +++ llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll @@ -5,7 +5,7 @@ ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9CHECK %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9CHECK %s ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10CHECK %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10CHECK %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-GISEL-CHECK %s ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11CHECK %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11CHECK %s @@ -916,40 +916,78 @@ ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v0, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v1, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v2, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v3, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v4, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v5, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v6, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v7, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v8, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v9, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v10, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v11, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v12, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v13, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4 -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v14, 3 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s8, v0, 3 ; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s4, v15, 3 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s5, v14, 3 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s6, v13, 3 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s7, v12, 3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s8, v1, 3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s6 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s7 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s8 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s8, v2, 3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s8, v3, 3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s8, v4, 3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s8, v5, 3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s8 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s8, v6, 3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s8, v7, 3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s8 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s8, v8, 3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s8 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s8, v9, 3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s8, v10, 3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s8 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s8, v11, 3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s8 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; +; GFX10-GISEL-CHECK-LABEL: isnan_v16f32: +; GFX10-GISEL-CHECK: ; %bb.0: +; GFX10-GISEL-CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s8, v0, 3 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s4, v12, 3 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s5, v13, 3 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s6, v14, 3 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s7, v15, 3 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s8, v1, 3 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s6 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, s7 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s8 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s8, v2, 3 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s8, v3, 3 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s8, v4, 3 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s8, v5, 3 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s8 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s8, v6, 3 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s8, v7, 3 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s8 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s8, v8, 3 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s8 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s8, v9, 3 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s8, v10, 3 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s8 +; GFX10-GISEL-CHECK-NEXT: v_cmp_class_f32_e64 s8, v11, 3 +; GFX10-GISEL-CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s8 +; GFX10-GISEL-CHECK-NEXT: s_setpc_b64 s[30:31] +; ; GFX11CHECK-LABEL: isnan_v16f32: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) Index: llvm/test/CodeGen/AMDGPU/llvm.mulo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -59,13 +59,13 @@ ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0 ; GFX10-NEXT: v_mad_u64_u32 v[6:7], s4, v4, v3, 0 ; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v5, v2, 0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v5, v3, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-NEXT: v_add3_u32 v1, v1, v6, v8 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v7, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v2, v9, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v5, v3, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo @@ -184,30 +184,30 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0 ; GFX10-NEXT: v_mad_u64_u32 v[6:7], s4, v4, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v5, v2, 0 -; GFX10-NEXT: v_mad_i64_i32 v[11:12], s4, v5, v3, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, v1 -; GFX10-NEXT: v_add3_u32 v1, v1, v6, v9 -; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v6 +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v8, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v5, v2, 0 +; GFX10-NEXT: v_add3_u32 v1, v1, v6, v7 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v9, v7 +; GFX10-NEXT: v_mad_i64_i32 v[6:7], s4, v5, v3, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v10, v8, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v9 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v10, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v12, vcc_lo -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v7, v11 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v8, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v6, v2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, 0, v7, vcc_lo ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v2, v4 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, 0, v5, vcc_lo ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, v4 +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -600,12 +600,11 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1] -; GFX10-NEXT: v_alignbit_b32 v3, v1, v0, 30 -; GFX10-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5] -; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_lshlrev_b64 v[3:4], 2, v[0:1] +; GFX10-NEXT: v_ashrrev_i64 v[4:5], 2, v[3:4] +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, v1, v0, 30 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -658,14 +657,13 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1] -; GFX10-NEXT: v_alignbit_b32 v3, v1, v0, 30 -; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0x3fffffff, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[3:4], 2, v[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, v1, v0, 30 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: umulo_i64_v_4: Index: llvm/test/CodeGen/AMDGPU/load-local.128.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/load-local.128.ll +++ llvm/test/CodeGen/AMDGPU/load-local.128.ll @@ -233,30 +233,30 @@ ; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 ; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 ; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 -; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 -; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 -; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 -; GFX10-NEXT: ds_read_u8 v12, v0 offset:11 -; GFX10-NEXT: ds_read_u8 v13, v0 offset:12 -; GFX10-NEXT: ds_read_u8 v14, v0 offset:13 -; GFX10-NEXT: ds_read_u8 v15, v0 offset:14 -; GFX10-NEXT: ds_read_u8 v0, v0 offset:15 -; GFX10-NEXT: s_waitcnt lgkmcnt(14) +; GFX10-NEXT: s_waitcnt lgkmcnt(6) ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(12) +; GFX10-NEXT: s_waitcnt lgkmcnt(4) ; GFX10-NEXT: v_lshl_or_b32 v2, v4, 8, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(10) +; GFX10-NEXT: s_waitcnt lgkmcnt(2) ; GFX10-NEXT: v_lshl_or_b32 v3, v6, 8, v5 -; GFX10-NEXT: s_waitcnt lgkmcnt(8) +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshl_or_b32 v4, v8, 8, v7 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v8, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:12 +; GFX10-NEXT: ds_read_u8 v10, v0 offset:13 +; GFX10-NEXT: ds_read_u8 v11, v0 offset:14 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:15 ; GFX10-NEXT: s_waitcnt lgkmcnt(6) -; GFX10-NEXT: v_lshl_or_b32 v5, v10, 8, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 8, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_lshl_or_b32 v6, v12, 8, v11 +; GFX10-NEXT: v_lshl_or_b32 v6, v8, 8, v7 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_lshl_or_b32 v7, v14, 8, v13 +; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshl_or_b32 v8, v0, 8, v15 +; GFX10-NEXT: v_lshl_or_b32 v8, v0, 8, v11 ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 Index: llvm/test/CodeGen/AMDGPU/load-local.96.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/load-local.96.ll +++ llvm/test/CodeGen/AMDGPU/load-local.96.ll @@ -198,22 +198,22 @@ ; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 ; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 ; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 -; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 -; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 -; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 -; GFX10-NEXT: ds_read_u8 v0, v0 offset:11 -; GFX10-NEXT: s_waitcnt lgkmcnt(10) +; GFX10-NEXT: s_waitcnt lgkmcnt(6) ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(8) +; GFX10-NEXT: s_waitcnt lgkmcnt(4) ; GFX10-NEXT: v_lshl_or_b32 v2, v4, 8, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(6) +; GFX10-NEXT: s_waitcnt lgkmcnt(2) ; GFX10-NEXT: v_lshl_or_b32 v3, v6, 8, v5 -; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshl_or_b32 v4, v8, 8, v7 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:11 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_lshl_or_b32 v5, v10, 8, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 8, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshl_or_b32 v6, v0, 8, v11 +; GFX10-NEXT: v_lshl_or_b32 v6, v0, 8, v7 ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 Index: llvm/test/CodeGen/AMDGPU/memcpy-scoped-aa.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memcpy-scoped-aa.ll +++ llvm/test/CodeGen/AMDGPU/memcpy-scoped-aa.ll @@ -20,7 +20,7 @@ ; CHECK-DAG: global_load_dwordx2 v[[[Q0:[0-9]+]]:[[Q1:[0-9]+]]], v[2:3], off ; CHECK-DAG: global_load_dwordx4 [[PVAL:v\[[0-9]+:[0-9]+\]]], v[0:1], off offset:16 ; CHECK-DAG: v_add_nc_u32_e32 v{{[0-9]+}}, v[[Q0]], v[[Q1]] -; CHECK: global_store_dwordx4 v[0:1], [[PVAL]], off +; CHECK-DAG: global_store_dwordx4 v[0:1], [[PVAL]], off ; CHECK: s_setpc_b64 s[30:31] %add.ptr = getelementptr inbounds i32, ptr addrspace(1) %p, i64 4 tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 4 dereferenceable(16) %p, ptr addrspace(1) noundef nonnull align 4 dereferenceable(16) %add.ptr, i64 16, i1 false), !alias.scope !2, !noalias !4 @@ -40,7 +40,7 @@ ; CHECK-DAG: global_load_dwordx2 v[[[Q0:[0-9]+]]:[[Q1:[0-9]+]]], v[2:3], off ; CHECK-DAG: global_load_dwordx4 [[PVAL:v\[[0-9]+:[0-9]+\]]], v[0:1], off offset:16 ; CHECK-DAG: v_add_nc_u32_e32 v{{[0-9]+}}, v[[Q0]], v[[Q1]] -; CHECK: global_store_dwordx4 v[0:1], [[PVAL]], off +; CHECK-DAG: global_store_dwordx4 v[0:1], [[PVAL]], off ; CHECK: s_setpc_b64 s[30:31] %add.ptr = getelementptr inbounds i32, ptr addrspace(1) %p, i64 4 tail call void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) noundef nonnull align 4 dereferenceable(16) %p, ptr addrspace(1) noundef nonnull align 4 dereferenceable(16) %add.ptr, i64 16, i1 false), !alias.scope !2, !noalias !4 @@ -60,7 +60,7 @@ ; CHECK-DAG: global_load_dwordx2 v[[[Q0:[0-9]+]]:[[Q1:[0-9]+]]], v[2:3], off ; CHECK-DAG: global_load_dwordx4 [[PVAL:v\[[0-9]+:[0-9]+\]]], v[0:1], off offset:16 ; CHECK-DAG: v_add_nc_u32_e32 v{{[0-9]+}}, v[[Q0]], v[[Q1]] -; CHECK: global_store_dwordx4 v[0:1], [[PVAL]] +; CHECK-DAG: global_store_dwordx4 v[0:1], [[PVAL]] ; CHECK: s_setpc_b64 s[30:31] %add.ptr = getelementptr inbounds i32, ptr addrspace(1) %p, i64 4 tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 4 dereferenceable(16) %p, ptr addrspace(1) noundef nonnull align 4 dereferenceable(16) %add.ptr, i64 16, i1 false), !alias.scope !2, !noalias !4 @@ -78,7 +78,7 @@ ; CHECK-LABEL: test_memset: ; CHECK-DAG: global_load_dwordx2 v[[[Q0:[0-9]+]]:[[Q1:[0-9]+]]], v[2:3], off ; CHECK-DAG: v_mov_b32_e32 v[[PVAL:[0-9]+]], 0xaaaaaaaa -; CHECK: global_store_dwordx4 v[0:1], v[[[PVAL]]{{:[0-9]+\]}}, off +; CHECK-DAG: global_store_dwordx4 v[0:1], v[[[PVAL]]{{:[0-9]+\]}}, off ; CHECK: v_add_nc_u32_e32 v{{[0-9]+}}, v[[Q0]], v[[Q1]] ; CHECK: s_setpc_b64 s[30:31] tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull align 4 dereferenceable(16) %p, i8 170, i64 16, i1 false), !alias.scope !2, !noalias !4 Index: llvm/test/CodeGen/AMDGPU/memory_clause.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -25,21 +25,21 @@ ; GCN-SCRATCH-LABEL: vector_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb ; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v16, 4, v0 +; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SCRATCH-NEXT: s_clause 0x3 -; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 -; GCN-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 -; GCN-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 -; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(3) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(2) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; GCN-SCRATCH-NEXT: s_clause 0x1 +; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] +; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 +; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] +; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] offset:32 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16 +; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:48 +; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] offset:32 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:48 ; GCN-SCRATCH-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -99,7 +99,7 @@ ; GCN-SCRATCH-LABEL: scalar_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb ; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -111,18 +111,18 @@ ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v5, s5 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v6, s6 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v7, s7 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, s8 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v9, s9 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v10, s10 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v11, s11 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v12, s12 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v13, s13 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v14, s14 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v15, s15 -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[0:3], s[18:19] -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[4:7], s[18:19] offset:16 -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[8:11], s[18:19] offset:32 -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[12:15], s[18:19] offset:48 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[18:19] +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s8 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s9 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v2, s10 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v3, s11 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[18:19] offset:16 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v4, s12 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v5, s13 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v6, s14 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v7, s15 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[18:19] offset:32 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[18:19] offset:48 ; GCN-SCRATCH-NEXT: s_endpgm bb: %tmp = load <4 x i32>, ptr addrspace(1) %arg, align 16 @@ -207,22 +207,23 @@ ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v2, 4, v31 -; GCN-SCRATCH-NEXT: v_and_b32_e32 v18, 0x3ff0, v2 -; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v0, v18 -; GCN-SCRATCH-NEXT: s_clause 0x3 -; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[2:5], v0, off -; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[6:9], v0, off offset:16 -; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[10:13], v0, off offset:32 -; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[14:17], v0, off offset:48 -; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v1, v18 -; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(3) -; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v0, v[2:5], off -; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(2) -; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16 +; GCN-SCRATCH-NEXT: v_and_b32_e32 v10, 0x3ff0, v2 +; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v11, v0, v10 +; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v10, v1, v10 +; GCN-SCRATCH-NEXT: s_clause 0x1 +; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[2:5], v11, off +; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[6:9], v11, off offset:16 +; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) +; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v10, v[2:5], off +; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) +; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v10, v[6:9], off offset:16 +; GCN-SCRATCH-NEXT: s_clause 0x1 +; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[0:3], v11, off offset:32 +; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[4:7], v11, off offset:48 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32 +; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v10, v[0:3], off offset:32 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48 +; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v10, v[4:7], off offset:48 ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-SCRATCH-NEXT: s_setpc_b64 s[30:31] bb: Index: llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -49,8 +49,8 @@ ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen -; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4 +; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB0_1 @@ -315,8 +315,8 @@ ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen -; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 +; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_3 @@ -757,7 +757,7 @@ ; GFX1010_W32-NEXT: ;;#ASMSTART ; GFX1010_W32-NEXT: s_mov_b32 s4, 17 ; GFX1010_W32-NEXT: ;;#ASMEND -; GFX1010_W32-NEXT: v_mov_b32_e32 v8, s4 +; GFX1010_W32-NEXT: v_mov_b32_e32 v9, s4 ; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo ; GFX1010_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0 @@ -768,9 +768,9 @@ ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[2:3] ; GFX1010_W32-NEXT: s_and_b32 s5, vcc_lo, s5 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, s5 -; GFX1010_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen +; GFX1010_W32-NEXT: ; implicit-def: $vgpr9 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s5 ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_1 @@ -792,9 +792,9 @@ ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[6:7] ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 -; GFX1010_W32-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen -; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX1010_W32-NEXT: buffer_load_format_x v8, v0, s[8:11], 0 idxen ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0 +; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_4 @@ -803,7 +803,7 @@ ; GFX1010_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) -; GFX1010_W32-NEXT: global_store_dword v[11:12], v9, off +; GFX1010_W32-NEXT: global_store_dword v[11:12], v8, off ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31] ; @@ -814,7 +814,7 @@ ; GFX1010_W64-NEXT: ;;#ASMSTART ; GFX1010_W64-NEXT: s_mov_b32 s4, 17 ; GFX1010_W64-NEXT: ;;#ASMEND -; GFX1010_W64-NEXT: v_mov_b32_e32 v8, s4 +; GFX1010_W64-NEXT: v_mov_b32_e32 v9, s4 ; GFX1010_W64-NEXT: s_mov_b64 s[12:13], exec ; GFX1010_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0 @@ -825,9 +825,9 @@ ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3] ; GFX1010_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] -; GFX1010_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 +; GFX1010_W64-NEXT: ; implicit-def: $vgpr9 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_1 @@ -849,7 +849,7 @@ ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7] ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX1010_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W64-NEXT: buffer_load_format_x v8, v0, s[8:11], 0 idxen ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 @@ -860,7 +860,7 @@ ; GFX1010_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W64-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) -; GFX1010_W64-NEXT: global_store_dword v[11:12], v9, off +; GFX1010_W64-NEXT: global_store_dword v[11:12], v8, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31] ; @@ -932,7 +932,7 @@ ; GFX1100_W64-NEXT: ;;#ASMSTART ; GFX1100_W64-NEXT: s_mov_b32 s4, 17 ; GFX1100_W64-NEXT: ;;#ASMEND -; GFX1100_W64-NEXT: v_mov_b32_e32 v8, s4 +; GFX1100_W64-NEXT: v_mov_b32_e32 v9, s4 ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s8, v0 @@ -945,9 +945,9 @@ ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W64-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 +; GFX1100_W64-NEXT: ; implicit-def: $vgpr9 ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_1 ; GFX1100_W64-NEXT: ; %bb.2: @@ -971,7 +971,7 @@ ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen +; GFX1100_W64-NEXT: buffer_load_format_x v8, v0, s[4:7], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0 ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] @@ -982,7 +982,7 @@ ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) -; GFX1100_W64-NEXT: global_store_b32 v[11:12], v9, off dlc +; GFX1100_W64-NEXT: global_store_b32 v[11:12], v8, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -192,45 +192,41 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x1000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, 0x2000 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[8:9], v[4:5], off offset:-2048 -; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[12:13], v[4:5], off -; GFX10-NEXT: global_load_dwordx2 v[14:15], v[10:11], off offset:-2048 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x3000 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, 0x3800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[4:5], off +; GFX10-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[12:13], v[6:7], off +; GFX10-NEXT: global_load_dwordx2 v[14:15], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v10, v14 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v11, v15, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x2000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[16:17], v[10:11], off -; GFX10-NEXT: global_load_dwordx2 v[18:19], v[4:5], off offset:-2048 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3800, v0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 0x3000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[10:11], v[4:5], off -; GFX10-NEXT: global_load_dwordx2 v[20:21], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo -; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo -; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v14, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v16, vcc_lo, v8, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v9, v7, vcc_lo +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[4:5], off +; GFX10-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[14:15], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v16, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v16 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v17, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v18, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v20, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v21, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v14, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -685,9 +681,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 17, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_movk_i32 s1, 0x7f +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: s_movk_i32 s0, 0x7f +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] ; GFX10-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v0, v1 @@ -698,82 +694,85 @@ ; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX10-NEXT: v_mov_b32_e32 v6, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: .LBB1_2: ; %for.body ; GFX10-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v5, 0xffffb800 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v6, vcc_lo -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v5, 0xffffc800 -; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v6, vcc_lo -; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v5, 0xffffd800 -; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, -1, v6, vcc_lo -; GFX10-NEXT: v_add_co_u32 v17, vcc_lo, v5, 0xffffe800 -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: global_load_dwordx2 v[11:12], v[7:8], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[15:16], v[9:10], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[19:20], v[13:14], off offset:-2048 -; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, -1, v6, vcc_lo -; GFX10-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, -1, v6, vcc_lo -; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_dwordx2 v[23:24], v[17:18], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[7:8], v[7:8], off -; GFX10-NEXT: global_load_dwordx2 v[9:10], v[9:10], off -; GFX10-NEXT: global_load_dwordx2 v[13:14], v[13:14], off -; GFX10-NEXT: global_load_dwordx2 v[25:26], v[17:18], off -; GFX10-NEXT: global_load_dwordx2 v[27:28], v[21:22], off -; GFX10-NEXT: global_load_dwordx2 v[29:30], v[5:6], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[31:32], v[5:6], off -; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, 0x10000, v5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo -; GFX10-NEXT: s_addk_i32 s2, 0x2000 -; GFX10-NEXT: s_cmp_gt_u32 s2, 0x3fffff -; GFX10-NEXT: s_waitcnt vmcnt(10) -; GFX10-NEXT: v_add_co_u32 v3, s0, v11, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v12, v4, s0 -; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_add_co_u32 v3, s0, v7, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v8, v4, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, v15, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v16, v4, s0 -; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_add_co_u32 v3, s0, v9, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v10, v4, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, v19, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v20, v4, s0 -; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_add_co_u32 v3, s0, v13, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v14, v4, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, v23, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v24, v4, s0 +; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v3, 0xffffb800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v3, 0xffffc800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, 0xfffff000, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dwordx2 v[13:14], v[7:8], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[15:16], v[7:8], off +; GFX10-NEXT: global_load_dwordx2 v[17:18], v[9:10], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[19:20], v[11:12], off +; GFX10-NEXT: s_addk_i32 s1, 0x2000 +; GFX10-NEXT: s_cmp_gt_u32 s1, 0x3fffff +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v13, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v14, v6, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v15, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v16, v6, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_add_co_u32 v21, vcc_lo, v17, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, v18, v6, vcc_lo +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v3, 0xffffd800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v3, 0xffffe800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dwordx2 v[11:12], v[9:10], off +; GFX10-NEXT: global_load_dwordx2 v[13:14], v[5:6], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[15:16], v[5:6], off +; GFX10-NEXT: global_load_dwordx2 v[17:18], v[7:8], off offset:-2048 ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_add_co_u32 v3, s0, v25, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v26, v4, s0 +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v11, v21 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v12, v22, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_add_co_u32 v3, s0, v27, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v28, v4, s0 +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v13, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v14, v6, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_add_co_u32 v3, s0, v29, v3 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, v30, v4, s0 +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v15, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v16, v6, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v31, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v32, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v17, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, v18, v6, vcc_lo +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[7:8], off +; GFX10-NEXT: global_load_dwordx2 v[9:10], v[3:4], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[11:12], v[3:4], off +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v5, v13 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v19, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v20, v6, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v9, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v10, v6, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v11, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v12, v6, vcc_lo +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x10000, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo ; GFX10-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX10-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX10-NEXT: s_add_i32 s0, s1, -1 -; GFX10-NEXT: s_cmp_eq_u32 s1, 0 +; GFX10-NEXT: s_add_i32 s1, s0, -1 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_mov_b32 s0, s1 ; GFX10-NEXT: s_branch .LBB1_1 ; GFX10-NEXT: .LBB1_5: ; %while.end ; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[0:1], v[3:4], off +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[5:6], off ; GFX10-NEXT: s_endpgm ; ; GFX90A-LABEL: clmem_read: @@ -904,8 +903,8 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xff, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 17, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 17, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_movk_i32 s1, 0x7f ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] @@ -922,84 +921,83 @@ ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v5, v1 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: .LBB1_2: ; %for.body ; GFX11-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v5, 0xffffc000 -; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v6, vcc_lo -; GFX11-NEXT: v_add_co_u32 v9, vcc_lo, 0xffffc000, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v6, vcc_lo -; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, 0xffffd000, v5 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[13:14], v[7:8], off offset:-4096 +; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v3, 0xffffc000 +; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v9, vcc_lo, 0xffffc000, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, 0xffffd000, v3 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_load_b64 v[17:18], v[7:8], off offset:-4096 ; GFX11-NEXT: global_load_b64 v[9:10], v[9:10], off offset:-2048 -; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, -1, v6, vcc_lo -; GFX11-NEXT: v_add_co_u32 v15, vcc_lo, v5, 0xffffe000 -; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, -1, v6, vcc_lo -; GFX11-NEXT: global_load_b64 v[11:12], v[11:12], off offset:-2048 -; GFX11-NEXT: v_add_co_u32 v17, vcc_lo, 0xffffe000, v5 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[19:20], v[15:16], off offset:-4096 ; GFX11-NEXT: global_load_b64 v[7:8], v[7:8], off -; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, -1, v6, vcc_lo -; GFX11-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, -1, v6, vcc_lo -; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: global_load_b64 v[17:18], v[17:18], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[15:16], v[15:16], off -; GFX11-NEXT: global_load_b64 v[21:22], v[21:22], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[23:24], v[5:6], off offset:-4096 -; GFX11-NEXT: global_load_b64 v[25:26], v[5:6], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[27:28], v[5:6], off -; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, 0x10000, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, -1, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v13, vcc_lo, v3, 0xffffe000 +; GFX11-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, -1, v4, vcc_lo +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_load_b64 v[11:12], v[11:12], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[19:20], v[13:14], off offset:-4096 +; GFX11-NEXT: global_load_b64 v[13:14], v[13:14], off +; GFX11-NEXT: v_add_co_u32 v15, vcc_lo, 0xffffe000, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, -1, v4, vcc_lo ; GFX11-NEXT: s_addk_i32 s2, 0x2000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_gt_u32 s2, 0x3fffff -; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: v_add_co_u32 v3, s0, v13, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v14, v4, s0 -; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v9, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v10, v4, s0 +; GFX11-NEXT: global_load_b64 v[15:16], v[15:16], off offset:-2048 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v7, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v8, v4, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v11, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v12, v4, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v19, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v20, v4, s0 +; GFX11-NEXT: v_add_co_u32 v21, vcc_lo, v17, v5 +; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, v18, v6, vcc_lo +; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, 0xfffff000, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v4, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v17, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v18, v4, s0 +; GFX11-NEXT: v_add_co_u32 v21, vcc_lo, v9, v21 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[5:6], v[5:6], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[17:18], v[3:4], off offset:-4096 +; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, v10, v22, vcc_lo +; GFX11-NEXT: global_load_b64 v[9:10], v[3:4], off offset:-2048 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: v_add_co_u32 v21, vcc_lo, v7, v21 +; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, v8, v22, vcc_lo +; GFX11-NEXT: global_load_b64 v[7:8], v[3:4], off +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: v_add_co_u32 v11, s0, v11, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, s0, v12, v22, s0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x10000, v3 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_add_co_u32 v11, s0, v19, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, s0, v20, v12, s0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v15, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v16, v4, s0 +; GFX11-NEXT: v_add_co_u32 v11, s0, v15, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, s0, v16, v12, s0 +; GFX11-NEXT: v_add_co_u32 v11, s0, v13, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, s0, v14, v12, s0 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v21, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v22, v4, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v5, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v6, v12, s0 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v23, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v24, v4, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v17, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v18, v6, s0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v3, s0, v25, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, v26, v4, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v9, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v10, v6, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v27, v3 -; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v28, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, v7, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v8, v6, vcc_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX11-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -1013,7 +1011,7 @@ ; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0 -; GFX11-NEXT: global_store_b64 v[0:1], v[3:4], off +; GFX11-NEXT: global_store_b64 v[0:1], v[5:6], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1290,39 +1288,37 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, 0x1000 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, 0x1000, v0 -; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: global_load_dword v10, v[0:1], off -; GFX10-NEXT: global_load_dword v11, v[0:1], off offset:1024 -; GFX10-NEXT: global_load_dword v12, v[4:5], off offset:1024 -; GFX10-NEXT: global_load_dword v13, v[6:7], off offset:-2048 -; GFX10-NEXT: global_load_dword v14, v[6:7], off -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x1800, v0 +; GFX10-NEXT: global_load_dword v6, v[4:5], off offset:1024 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x1000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, 0x2000 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v15, v[8:9], off offset:1024 -; GFX10-NEXT: global_load_dword v16, v[4:5], off offset:1024 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v7, v[4:5], off offset:-2048 +; GFX10-NEXT: global_load_dword v8, v[4:5], off +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x1000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v9, v[4:5], off offset:1024 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x1800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v10, v[4:5], off offset:1024 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x2000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: global_load_dword v4, v[6:7], off offset:-2048 -; GFX10-NEXT: global_load_dword v5, v[6:7], off -; GFX10-NEXT: global_load_dword v8, v[0:1], off offset:1024 -; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: v_add_nc_u32_e32 v0, v11, v10 -; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_add3_u32 v0, v13, v0, v12 -; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_add3_u32 v0, v14, v0, v15 -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_add3_u32 v0, v4, v0, v16 +; GFX10-NEXT: global_load_dword v11, v[4:5], off offset:1024 +; GFX10-NEXT: global_load_dword v12, v[0:1], off +; GFX10-NEXT: global_load_dword v13, v[0:1], off offset:1024 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 0x2000 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v4, v13, v12 +; GFX10-NEXT: v_add3_u32 v4, v7, v4, v6 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: v_add3_u32 v4, v8, v4, v9 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_add3_u32 v0, v5, v4, v10 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add3_u32 v0, v5, v0, v8 +; GFX10-NEXT: v_add3_u32 v0, v6, v0, v11 ; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm ; @@ -1626,23 +1622,22 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0xfffff800 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[8:9], v[4:5], off offset:-2048 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[10:11], v[4:5], off -; GFX10-NEXT: global_load_dwordx2 v[12:13], v[0:1], off +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, 0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 1, v1, vcc_lo +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[12:13], v[4:5], off +; GFX10-NEXT: global_load_dwordx2 v[14:15], v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v9, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v14, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -1898,17 +1893,17 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x80000000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: global_load_dword v6, v[0:1], off -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: global_load_dword v7, v[4:5], off offset:-2048 -; GFX10-NEXT: global_load_dword v8, v[4:5], off -; GFX10-NEXT: global_load_dword v9, v[0:1], off offset:1024 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, 0x7ffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dword v8, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[4:5], off offset:-2048 +; GFX10-NEXT: global_load_dword v10, v[4:5], off +; GFX10-NEXT: global_load_dword v11, v[6:7], off offset:1024 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_add_nc_u32_e32 v0, v7, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v9, v8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add3_u32 v0, v9, v0, v8 +; GFX10-NEXT: v_add3_u32 v0, v11, v0, v10 ; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm ; @@ -2175,41 +2170,41 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 -; GFX10-NEXT: v_add_co_u32 v0, s0, s36, v2 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v0 +; GFX10-NEXT: v_add_co_u32 v0, s0, s36, v8 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s37, 0, s0 -; GFX10-NEXT: v_add_co_u32 v14, s0, s38, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s0, s39, 0, s0 +; GFX10-NEXT: v_add_co_u32 v8, s0, s38, v8 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x1800 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v14, 0x3000 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v15, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s0, s39, 0, s0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v6, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v7, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v8, 0x3000 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x2000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[12:13], v[4:5], off -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x3800, v14 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v15, vcc_lo -; GFX10-NEXT: global_load_dwordx2 v[14:15], v[2:3], off -; GFX10-NEXT: global_load_dwordx2 v[16:17], v[4:5], off -; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v8, v6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v7, vcc_lo -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, v10 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v13, v11, vcc_lo -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo +; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v9 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v10, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x3800, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo +; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v16, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v17, v5, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm ; @@ -2488,47 +2483,45 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, 0x3000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[8:9], v[0:1], off +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, 0x2800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: global_load_dwordx2 v[10:11], v[4:5], off -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x2800, v0 +; GFX10-NEXT: global_load_dwordx2 v[12:13], v[6:7], off +; GFX10-NEXT: global_load_dwordx2 v[14:15], v[8:9], off +; GFX10-NEXT: global_load_dwordx2 v[16:17], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v10, v16 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v11, v17, vcc_lo +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v12, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v13, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x2000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, 0x2000, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, 0x1800, v0 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[6:7], off -; GFX10-NEXT: global_load_dwordx2 v[12:13], v[12:13], off -; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v16, vcc_lo, 0x1000, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[14:15], v[14:15], off -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[4:5], off +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, 0x1800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, 0x1000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[18:19], v[16:17], off -; GFX10-NEXT: global_load_dwordx2 v[20:21], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v9, vcc_lo -; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v18, vcc_lo, v14, v10 +; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v15, v11, vcc_lo +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dwordx2 v[10:11], v[4:5], off +; GFX10-NEXT: global_load_dwordx2 v[12:13], v[6:7], off +; GFX10-NEXT: global_load_dwordx2 v[14:15], v[8:9], off +; GFX10-NEXT: global_load_dwordx2 v[16:17], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v18 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v19, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v14, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v18, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v20, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v21, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v16, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/saddo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/saddo.ll +++ llvm/test/CodeGen/AMDGPU/saddo.ll @@ -672,22 +672,22 @@ ; GFX10-LABEL: v_saddo_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[4:5] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[4:5] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v4, v1, v3 -; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp -; GFX10-NEXT: v_add_nc_u32_e32 v3, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v1, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v0, v2 ; GFX10-NEXT: v_add_nc_i32 v0, v0, v2 clamp -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0 +; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp +; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, v5, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[0:1] -; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_saddo_v2i32: Index: llvm/test/CodeGen/AMDGPU/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -457,12 +457,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 -; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll +++ llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll @@ -1,5 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=MISCHED %s ; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s ; Test the scheduler when only one wave is requested. The result should be high register usage and max ILP. @@ -597,4 +599,4 @@ declare float @llvm.fmuladd.f32(float, float, float) #0 attributes #0 = { nounwind readnone } -attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } +attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,128" } Index: llvm/test/CodeGen/AMDGPU/scratch-simple.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -77,8 +77,8 @@ ; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]] ; FLATSCR: v_add{{_|_nc_}}{{u32|b32}}_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]] -; MUBUF: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off offset:128 define amdgpu_ps float @ps_main(i32 %idx) { Index: llvm/test/CodeGen/AMDGPU/smrd.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/smrd.ll +++ llvm/test/CodeGen/AMDGPU/smrd.ll @@ -443,8 +443,8 @@ ; GCN-LABEL: {{^}}smrd_vgpr_merged: ; GCN-NEXT: %bb. ; GFX10-NEXT: s_clause -; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 +; GCN-DAG: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +; GCN-DAG: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 define amdgpu_ps void @smrd_vgpr_merged(<4 x i32> inreg %desc, i32 %a) #0 { main_body: %a1 = add i32 %a, 4 Index: llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -30,6 +30,25 @@ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr9 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr10 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr8 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) undef`, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: KILL undef %89:sgpr_128 + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM]] + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_U32_e64_1]], [[V_ADD_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_]], [[V_ADD_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -4, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_1]], [[V_ADD_U32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_SUBREV_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 27, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_2]], [[V_SUBREV_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_SUBREV_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 28, [[BUFFER_LOAD_DWORD_OFFSET1]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_3]], [[V_SUBREV_U32_e64_1]], implicit $exec ; CHECK-NEXT: undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4) ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc @@ -40,18 +59,11 @@ ; CHECK-NEXT: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) undef`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %132:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: KILL undef %74:sreg_64 ; CHECK-NEXT: KILL undef %132:sgpr_128 ; CHECK-NEXT: KILL %130.sub0, %130.sub1 - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: undef %302.sub1:sgpr_128 = S_MOV_B32 0 - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: KILL undef %89:sgpr_128 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc ; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc @@ -59,316 +71,303 @@ ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %54:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef %149.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: %149.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef %156.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0 :: (invariant load (s128) from %ir.87, addrspace 4) + ; CHECK-NEXT: [[V_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_SUB_I32_]], [[V_OR_B32_e64_4]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_6:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_SUB_I32_1]], [[V_OR_B32_e64_5]], implicit $exec + ; CHECK-NEXT: undef %156.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: %156.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0 :: (invariant load (s128) from %ir.92, addrspace 4) + ; CHECK-NEXT: [[V_OR_B32_e64_7:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_SUB_I32_2]], [[V_OR_B32_e64_6]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_SUBREV_U32_e64_2:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 32, [[BUFFER_LOAD_FORMAT_X_IDXEN2]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_8:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_7]], [[V_SUBREV_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_SUBREV_U32_e64_3:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 33, [[BUFFER_LOAD_FORMAT_X_IDXEN3]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_9:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_8]], [[V_SUBREV_U32_e64_3]], implicit $exec ; CHECK-NEXT: undef %163.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: %163.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %163, 0, 0 :: (invariant load (s128) from %ir.97, addrspace 4) ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %171:sreg_32, 31, implicit-def dead $scc ; CHECK-NEXT: undef %176.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], undef %171:sreg_32, implicit-def $scc ; CHECK-NEXT: %176.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0 :: (invariant load (s128) from %ir.104, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_SUBREV_U32_e64_4:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 34, [[BUFFER_LOAD_FORMAT_X_IDXEN4]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_10:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_9]], [[V_SUBREV_U32_e64_4]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_SUBREV_U32_e64_5:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 36, [[BUFFER_LOAD_FORMAT_X_IDXEN5]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_11:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_10]], [[V_SUBREV_U32_e64_5]], implicit $exec ; CHECK-NEXT: undef %183.sub0:sreg_64 = S_ADD_U32 %50.sub0, [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: %183.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0 :: (invariant load (s128) from %ir.109, addrspace 4) ; CHECK-NEXT: undef %190.sub0:sreg_64 = S_ADD_U32 %50.sub0, [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: %190.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0 :: (invariant load (s128) from %ir.114, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_SUBREV_U32_e64_6:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 37, [[BUFFER_LOAD_FORMAT_X_IDXEN6]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_11]], [[V_SUBREV_U32_e64_6]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_SUBREV_U32_e64_7:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 38, [[BUFFER_LOAD_FORMAT_X_IDXEN7]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_12]], [[V_SUBREV_U32_e64_7]], implicit $exec + ; CHECK-NEXT: [[V_SUBREV_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 39, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_13]], [[V_SUBREV_U32_e64_8]], implicit $exec ; CHECK-NEXT: undef %200.sub0:sreg_64 = S_ADD_U32 %50.sub0, undef %171:sreg_32, implicit-def $scc ; CHECK-NEXT: %200.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0 :: (invariant load (s128) from %ir.121, addrspace 4) ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 %50.sub0, 224, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %51:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef %210.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: %210.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0 :: (invariant load (s128) from %ir.127, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_15:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_14]], [[V_SUBREV_U32_e64_9]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_SUBREV_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 51, [[BUFFER_LOAD_FORMAT_X_IDXEN9]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_16:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_15]], [[V_SUBREV_U32_e64_10]], implicit $exec ; CHECK-NEXT: undef %217.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: %217.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0 :: (invariant load (s128) from %ir.132, addrspace 4) ; CHECK-NEXT: undef %224.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: %224.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0 :: (invariant load (s128) from %ir.137, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_SUBREV_U32_e64_11:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 52, [[BUFFER_LOAD_FORMAT_X_IDXEN10]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_17:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_16]], [[V_SUBREV_U32_e64_11]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_SUBREV_U32_e64_12:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 53, [[BUFFER_LOAD_FORMAT_X_IDXEN11]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_18:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_17]], [[V_SUBREV_U32_e64_12]], implicit $exec + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0 :: (invariant load (s128) from %ir.126, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -72, [[BUFFER_LOAD_FORMAT_X_IDXEN12]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_19:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_18]], [[V_ADD_U32_e64_4]], implicit $exec ; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 %50.sub0, 576, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %51:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef %241.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: %241.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0 :: (invariant load (s128) from %ir.147, addrspace 4) ; CHECK-NEXT: undef %253.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: %253.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0 :: (invariant load (s128) from %ir.154, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -73, [[BUFFER_LOAD_FORMAT_X_IDXEN13]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_20:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_19]], [[V_ADD_U32_e64_5]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -74, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_21:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_20]], [[V_ADD_U32_e64_6]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -75, [[BUFFER_LOAD_FORMAT_X_IDXEN14]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_22:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_21]], [[V_ADD_U32_e64_7]], implicit $exec ; CHECK-NEXT: undef %261.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], undef %171:sreg_32, implicit-def $scc ; CHECK-NEXT: %261.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0 :: (invariant load (s128) from %ir.159, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -77, [[BUFFER_LOAD_FORMAT_X_IDXEN15]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_23:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_22]], [[V_ADD_U32_e64_8]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -93, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_24:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_23]], [[V_ADD_U32_e64_9]], implicit $exec ; CHECK-NEXT: undef %273.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: %273.sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0 :: (invariant load (s128) from %ir.167, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -94, [[BUFFER_LOAD_FORMAT_X_IDXEN16]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_25:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_24]], [[V_ADD_U32_e64_10]], implicit $exec ; CHECK-NEXT: undef %286.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: %286.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0 :: (invariant load (s128) from %ir.175, addrspace 4) + ; CHECK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -95, [[BUFFER_LOAD_DWORD_OFFSET]], 0, implicit $exec ; CHECK-NEXT: undef %293.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: %293.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4) + ; CHECK-NEXT: [[V_OR_B32_e64_26:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_25]], [[V_ADD_U32_e64_11]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -96, [[BUFFER_LOAD_DWORD_OFFSET2]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_27:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_26]], [[V_ADD_U32_e64_12]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: undef %302.sub1:sgpr_128 = S_MOV_B32 0 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %302, [[S_ADD_I32_]], 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %302, undef %314:sreg_32, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %302, [[S_ADD_I32_1]], 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %302, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %369:sgpr_128, undef %370:sreg_32, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0 :: (invariant load (s128) from %ir.92, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %163, 0, 0 :: (invariant load (s128) from %ir.97, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0 :: (invariant load (s128) from %ir.104, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0 :: (invariant load (s128) from %ir.109, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0 :: (invariant load (s128) from %ir.114, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: KILL undef %314:sreg_32 ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR1]], -114, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR2]], -130, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc ; CHECK-NEXT: undef %327.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: %327.sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0 :: (invariant load (s128) from %ir.202, addrspace 4) + ; CHECK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -97, [[BUFFER_LOAD_DWORD_OFFSET3]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_28:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_27]], [[V_ADD_U32_e64_13]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: %335.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[V_OR_B32_e64_29:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_2]], [[V_OR_B32_e64_28]], implicit $exec + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0 :: (invariant load (s128) from %ir.208, addrspace 4) + ; CHECK-NEXT: [[V_OR_B32_e64_30:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_3]], [[V_OR_B32_e64_29]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_31:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_4]], [[V_OR_B32_e64_30]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -194, [[BUFFER_LOAD_FORMAT_X_IDXEN17]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_32:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_5]], [[V_OR_B32_e64_31]], implicit $exec ; CHECK-NEXT: undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0 :: (invariant load (s128) from %ir.121, addrspace 4) ; CHECK-NEXT: %343.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4) + ; CHECK-NEXT: [[V_OR_B32_e64_33:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_32]], [[V_ADD_U32_e64_14]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -195, [[BUFFER_LOAD_FORMAT_X_IDXEN18]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_34:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_33]], [[V_ADD_U32_e64_15]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: %351.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4) + ; CHECK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -196, [[BUFFER_LOAD_FORMAT_X_IDXEN19]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_35:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_34]], [[V_ADD_U32_e64_16]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %369:sgpr_128, undef %370:sreg_32, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: KILL undef %370:sreg_32 + ; CHECK-NEXT: KILL undef %364:sgpr_128 + ; CHECK-NEXT: KILL undef %369:sgpr_128 + ; CHECK-NEXT: KILL [[S_ADD_I32_]] + ; CHECK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -197, [[BUFFER_LOAD_FORMAT_X_IDXEN20]], 0, implicit $exec + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[V_OR_B32_e64_36:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_35]], [[V_ADD_U32_e64_17]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -216, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_OR_B32_e64_37:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_36]], [[V_ADD_U32_e64_18]], implicit $exec ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -217, implicit-def dead $scc ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %396:sgpr_128, [[S_ADD_I32_6]], 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0 :: (invariant load (s128) from %ir.126, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0 :: (invariant load (s128) from %ir.127, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0 :: (invariant load (s128) from %ir.132, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0 :: (invariant load (s128) from %ir.137, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -217, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -233, implicit-def dead $scc + ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_37]], implicit $exec + ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -233, implicit-def dead $scc + ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_38]], implicit $exec ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR5]], -249, implicit-def dead $scc + ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_39]], implicit $exec ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -313, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -329, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -345, implicit-def dead $scc + ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_40]], implicit $exec + ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -313, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -329, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -345, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR6]], -441, implicit-def dead $scc + ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_41]], implicit $exec ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], 160, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %36:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef %411.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: %411.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_42]], implicit $exec + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0 :: (invariant load (s128) from %ir.253, addrspace 4) + ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_43]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -457, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_44]], implicit $exec ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[V_OR_B32_e64_46:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_45]], [[V_ADD_U32_e64_19]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -458, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: undef %425.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_4]], implicit-def $scc ; CHECK-NEXT: %425.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %425, 0, 0 :: (invariant load (s128) from %ir.261, addrspace 4) + ; CHECK-NEXT: [[V_OR_B32_e64_47:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_46]], [[V_ADD_U32_e64_20]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -459, [[BUFFER_LOAD_FORMAT_X_IDXEN21]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_48:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_47]], [[V_ADD_U32_e64_21]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]]:sreg_32 = S_ADD_U32 %56.sub0, 168, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %57:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0 :: (invariant load (s128) from %ir.147, addrspace 4) ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc ; CHECK-NEXT: undef %441.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_5]], implicit-def $scc ; CHECK-NEXT: %441.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %441, 0, 0 :: (invariant load (s32) from %ir.269, align 8, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0 :: (invariant load (s128) from %ir.154, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0 :: (invariant load (s128) from %ir.159, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: %71.sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK-NEXT: %71.sub2:sgpr_128 = S_MOV_B32 -1 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY %71 - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0 :: (invariant load (s128) from %ir.167, addrspace 4) ; CHECK-NEXT: [[COPY13]].sub1:sgpr_128 = COPY %302.sub1 ; CHECK-NEXT: [[COPY13]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY13]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0 :: (invariant load (s128) from %ir.175, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc ; CHECK-NEXT: undef %453.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_6]], implicit-def $scc ; CHECK-NEXT: %453.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %453, 0, 0 :: (invariant load (s64) from %ir.277, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0 :: (invariant load (s128) from %ir.202, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0 :: (invariant load (s128) from %ir.208, addrspace 4) ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY %71 - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4) ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY14]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 ; CHECK-NEXT: [[COPY14]].sub1:sgpr_128 = COPY [[S_AND_B32_]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY14]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc ; CHECK-NEXT: undef %468.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_7]], implicit-def $scc ; CHECK-NEXT: %468.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %468, 0, 0 :: (invariant load (s64) from %ir.287, addrspace 4) ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY %71 ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY15]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 ; CHECK-NEXT: [[COPY15]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0 :: (invariant load (s128) from %ir.253, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4) - ; CHECK-NEXT: KILL %411.sub0, %411.sub1 - ; CHECK-NEXT: KILL undef %488:sreg_64 - ; CHECK-NEXT: KILL [[COPY15]].sub0_sub1, [[COPY15]].sub2_sub3 ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %425, 0, 0 :: (invariant load (s128) from %ir.261, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc ; CHECK-NEXT: undef %485.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_8]], implicit-def $scc ; CHECK-NEXT: %485.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %485, 0, 0 :: (invariant load (s32) from %ir.298, align 8, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY %71 ; CHECK-NEXT: [[COPY16]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] ; CHECK-NEXT: [[COPY16]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY16]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -475, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -491, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -507, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -539, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -475, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -491, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -507, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -539, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 96, implicit-def $scc ; CHECK-NEXT: [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %33:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef %514.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: %514.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %514, 0, 0 :: (invariant load (s128) from %ir.316, addrspace 4) + ; CHECK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -466, [[BUFFER_LOAD_FORMAT_X_IDXEN22]], 0, implicit $exec ; CHECK-NEXT: undef %522.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: %522.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %522, 0, 0 :: (invariant load (s128) from %ir.321, addrspace 4) - ; CHECK-NEXT: undef %530.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: %530.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %530, 0, 0 :: (invariant load (s128) from %ir.326, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] - ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] - ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_U32_e64_]], [[V_ADD_U32_e64_1]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -4, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_]], [[V_ADD_U32_e64_2]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 27, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_1]], [[V_ADD_U32_e64_3]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 28, [[BUFFER_LOAD_DWORD_OFFSET]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_2]], [[V_SUBREV_U32_e64_]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_3]], [[V_SUBREV_U32_e64_1]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_SUB_I32_]], [[V_OR_B32_e64_4]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_6:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_SUB_I32_1]], [[V_OR_B32_e64_5]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_7:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_SUB_I32_2]], [[V_OR_B32_e64_6]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_2:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 32, [[BUFFER_LOAD_FORMAT_X_IDXEN2]], 0, implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_3:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 33, [[BUFFER_LOAD_FORMAT_X_IDXEN3]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_8:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_7]], [[V_SUBREV_U32_e64_2]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_4:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 34, [[BUFFER_LOAD_FORMAT_X_IDXEN4]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_9:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_8]], [[V_SUBREV_U32_e64_3]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_5:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 36, [[BUFFER_LOAD_FORMAT_X_IDXEN5]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_10:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_9]], [[V_SUBREV_U32_e64_4]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_6:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 37, [[BUFFER_LOAD_FORMAT_X_IDXEN6]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_11:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_10]], [[V_SUBREV_U32_e64_5]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_7:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 38, [[BUFFER_LOAD_FORMAT_X_IDXEN7]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_11]], [[V_SUBREV_U32_e64_6]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 39, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_12]], [[V_SUBREV_U32_e64_7]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_13]], [[V_SUBREV_U32_e64_8]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 51, [[BUFFER_LOAD_FORMAT_X_IDXEN9]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_15:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_14]], [[V_SUBREV_U32_e64_9]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_11:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 52, [[BUFFER_LOAD_FORMAT_X_IDXEN10]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_16:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_15]], [[V_SUBREV_U32_e64_10]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_12:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 53, [[BUFFER_LOAD_FORMAT_X_IDXEN11]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_17:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_16]], [[V_SUBREV_U32_e64_11]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -72, [[BUFFER_LOAD_FORMAT_X_IDXEN12]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_18:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_17]], [[V_SUBREV_U32_e64_12]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -73, [[BUFFER_LOAD_FORMAT_X_IDXEN13]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_19:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_18]], [[V_ADD_U32_e64_4]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -74, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_20:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_19]], [[V_ADD_U32_e64_5]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -75, [[BUFFER_LOAD_FORMAT_X_IDXEN14]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_21:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_20]], [[V_ADD_U32_e64_6]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -77, [[BUFFER_LOAD_FORMAT_X_IDXEN15]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_22:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_21]], [[V_ADD_U32_e64_7]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -93, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_23:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_22]], [[V_ADD_U32_e64_8]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -94, [[BUFFER_LOAD_FORMAT_X_IDXEN16]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_24:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_23]], [[V_ADD_U32_e64_9]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -95, [[BUFFER_LOAD_DWORD_OFFSET1]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_25:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_24]], [[V_ADD_U32_e64_10]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -96, [[BUFFER_LOAD_DWORD_OFFSET2]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_26:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_25]], [[V_ADD_U32_e64_11]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -97, [[BUFFER_LOAD_DWORD_OFFSET3]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_27:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_26]], [[V_ADD_U32_e64_12]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_28:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_27]], [[V_ADD_U32_e64_13]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_29:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_2]], [[V_OR_B32_e64_28]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_30:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_3]], [[V_OR_B32_e64_29]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_31:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_4]], [[V_OR_B32_e64_30]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -194, [[BUFFER_LOAD_FORMAT_X_IDXEN17]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_32:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_5]], [[V_OR_B32_e64_31]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -195, [[BUFFER_LOAD_FORMAT_X_IDXEN18]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_33:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_32]], [[V_ADD_U32_e64_14]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -196, [[BUFFER_LOAD_FORMAT_X_IDXEN19]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_34:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_33]], [[V_ADD_U32_e64_15]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -197, [[BUFFER_LOAD_FORMAT_X_IDXEN20]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_35:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_34]], [[V_ADD_U32_e64_16]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -216, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_36:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_35]], [[V_ADD_U32_e64_17]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_37:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_36]], [[V_ADD_U32_e64_18]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_37]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_38]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_39]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_40]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_41]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_42]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_43]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -457, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_44]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -458, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_46:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_45]], [[V_ADD_U32_e64_19]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -459, [[BUFFER_LOAD_FORMAT_X_IDXEN21]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_47:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_46]], [[V_ADD_U32_e64_20]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -466, [[BUFFER_LOAD_FORMAT_X_IDXEN22]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_48:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_47]], [[V_ADD_U32_e64_21]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_49:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_48]], [[V_ADD_U32_e64_22]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_15]], [[V_OR_B32_e64_49]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_50]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_51]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_52]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_53]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_54]], implicit $exec + ; CHECK-NEXT: undef %530.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: %530.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_55]], implicit $exec + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %530, 0, 0 :: (invariant load (s128) from %ir.326, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_56]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_57]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -555, [[BUFFER_LOAD_FORMAT_X_IDXEN23]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -556, [[BUFFER_LOAD_FORMAT_X_IDXEN24]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_59:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_58]], [[V_ADD_U32_e64_23]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7) + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] + ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] + ; CHECK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -556, [[BUFFER_LOAD_FORMAT_X_IDXEN24]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -557, [[BUFFER_LOAD_FORMAT_X_IDXEN25]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_60:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_59]], [[V_ADD_U32_e64_24]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -574, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -574, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_61:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_60]], [[V_ADD_U32_e64_25]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_62:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_61]], [[V_ADD_U32_e64_26]], implicit $exec ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -576, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -576, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_63:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_62]], [[V_ADD_U32_e64_27]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -577, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -577, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %564:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec Index: llvm/test/CodeGen/AMDGPU/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -989,8 +989,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_sub_nc_i32 v0, v0, v16 clamp +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX10-NEXT: v_sub_nc_i32 v1, v1, v17 clamp ; GFX10-NEXT: v_sub_nc_i32 v2, v2, v18 clamp ; GFX10-NEXT: v_sub_nc_i32 v3, v3, v19 clamp @@ -1006,15 +1006,15 @@ ; GFX10-NEXT: v_sub_nc_i32 v13, v13, v29 clamp ; GFX10-NEXT: v_sub_nc_i32 v14, v14, v30 clamp ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_i32 v15, v15, v31 clamp +; GFX10-NEXT: v_sub_nc_i32 v15, v15, v16 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ssubsat_v16i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: v_sub_nc_i32 v0, v0, v16 clamp +; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: v_sub_nc_i32 v1, v1, v17 clamp ; GFX11-NEXT: v_sub_nc_i32 v2, v2, v18 clamp ; GFX11-NEXT: v_sub_nc_i32 v3, v3, v19 clamp @@ -1030,7 +1030,7 @@ ; GFX11-NEXT: v_sub_nc_i32 v13, v13, v29 clamp ; GFX11-NEXT: v_sub_nc_i32 v14, v14, v30 clamp ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_i32 v15, v15, v31 clamp +; GFX11-NEXT: v_sub_nc_i32 v15, v15, v16 clamp ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -1086,12 +1086,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 -; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr addrspace(1) %input, ptr addrspace(1) %output, i32 %i) { ; MUBUF-LABEL: kernel_background_evaluate: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_load_dword s0, s[0:1], 0x24 +; MUBUF-NEXT: s_load_dword s6, s[0:1], 0x24 ; MUBUF-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; MUBUF-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; MUBUF-NEXT: s_mov_b32 s38, -1 @@ -21,14 +21,14 @@ ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0 ; MUBUF-NEXT: v_mov_b32_e32 v4, 0x400000 +; MUBUF-NEXT: s_mov_b64 s[0:1], s[36:37] +; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] ; MUBUF-NEXT: s_mov_b32 s32, 0xc0000 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) -; MUBUF-NEXT: v_mov_b32_e32 v0, s0 -; MUBUF-NEXT: s_mov_b64 s[0:1], s[36:37] -; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] +; MUBUF-NEXT: v_mov_b32_e32 v0, s6 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; MUBUF-NEXT: s_and_saveexec_b32 s0, vcc_lo Index: llvm/test/CodeGen/AMDGPU/store-local.128.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -235,38 +235,38 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_lshr_b32 s3, s7, 24 ; GFX10-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_lshr_b32 s0, s6, 8 ; GFX10-NEXT: s_lshr_b32 s1, s6, 24 -; GFX10-NEXT: s_lshr_b32 s6, s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_lshr_b32 s2, s7, 8 -; GFX10-NEXT: s_lshr_b32 s4, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v8, s3 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_mov_b32_e32 v9, s6 -; GFX10-NEXT: s_lshr_b32 s0, s5, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: ds_write_b8 v0, v2 offset:12 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:14 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_lshr_b32 s2, s7, 8 ; GFX10-NEXT: ds_write_b8 v0, v3 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 -; GFX10-NEXT: ds_write_b8 v0, v4 offset:4 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:6 -; GFX10-NEXT: ds_write_b8 v0, v5 offset:9 -; GFX10-NEXT: ds_write_b8 v0, v6 offset:11 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:13 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: s_lshr_b32 s3, s7, 24 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_lshr_b32 s6, s4, 8 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:9 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:11 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: s_lshr_b32 s0, s4, 24 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:13 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:15 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_lshr_b32 s0, s5, 8 ; GFX10-NEXT: s_lshr_b32 s1, s5, 24 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:1 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: ds_write_b8 v0, v8 offset:15 -; GFX10-NEXT: ds_write_b8 v0, v9 offset:1 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX10-NEXT: ds_write_b8 v0, v2 offset:5 ; GFX10-NEXT: ds_write_b8 v0, v3 offset:7 @@ -404,15 +404,15 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: ds_write_b16 v0, v2 offset:12 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:14 ; GFX10-NEXT: ds_write_b16 v0, v3 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2 -; GFX10-NEXT: ds_write_b16 v0, v4 offset:4 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align2: @@ -492,9 +492,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 ; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3 -; GFX10-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: ds_write2_b32 v0, v3, v1 offset1:1 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align4: @@ -563,9 +563,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX10-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/store-local.96.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -205,27 +205,27 @@ ; GFX10-NEXT: s_lshr_b32 s0, s6, 8 ; GFX10-NEXT: s_lshr_b32 s1, s6, 24 ; GFX10-NEXT: s_lshr_b32 s2, s4, 8 -; GFX10-NEXT: s_lshr_b32 s3, s4, 24 -; GFX10-NEXT: s_lshr_b32 s4, s5, 8 -; GFX10-NEXT: s_lshr_b32 s5, s5, 24 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v5, s1 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 -; GFX10-NEXT: v_mov_b32_e32 v8, s4 -; GFX10-NEXT: v_mov_b32_e32 v9, s5 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: ds_write_b8 v0, v2 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 -; GFX10-NEXT: ds_write_b8 v0, v4 offset:9 -; GFX10-NEXT: ds_write_b8 v0, v5 offset:11 -; GFX10-NEXT: ds_write_b8 v0, v6 offset:1 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:3 -; GFX10-NEXT: ds_write_b8 v0, v8 offset:5 -; GFX10-NEXT: ds_write_b8 v0, v9 offset:7 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: s_lshr_b32 s3, s4, 24 +; GFX10-NEXT: s_lshr_b32 s4, s5, 8 +; GFX10-NEXT: s_lshr_b32 s5, s5, 24 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:11 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:1 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:3 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:7 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align1: Index: llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll +++ llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll @@ -188,12 +188,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f16_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fadd_v4f16_fpexcept_strict: Index: llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -149,15 +149,15 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; GFX10-NEXT: v_fmac_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_fmac_f16_e32 v6, v8, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3 -; GFX10-NEXT: v_fmac_f16_e32 v9, v11, v10 +; GFX10-NEXT: v_fmac_f16_e32 v6, v8, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX10-NEXT: v_fmac_f16_e32 v4, v0, v2 ; GFX10-NEXT: v_perm_b32 v1, v6, v5, 0x5040100 -; GFX10-NEXT: v_perm_b32 v0, v9, v4, 0x5040100 +; GFX10-NEXT: v_fmac_f16_e32 v7, v9, v8 +; GFX10-NEXT: v_perm_b32 v0, v7, v4, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fma_v4f16_fpexcept_strict: Index: llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll +++ llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll @@ -291,12 +291,12 @@ ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SDAG-NEXT: v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-SDAG-NEXT: v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_mul_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-SDAG-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX10-SDAG-NEXT: v_mul_f16_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-SDAG-NEXT: v_mul_f16_e32 v1, v1, v3 -; GFX10-SDAG-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX10-SDAG-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX10-SDAG-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-SDAG-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_constained_fmul_v4f16_fpexcept_strict: Index: llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -424,12 +424,12 @@ ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SDAG-NEXT: v_sub_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-SDAG-NEXT: v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_sub_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-SDAG-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX10-SDAG-NEXT: v_sub_f16_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-SDAG-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-SDAG-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX10-SDAG-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX10-SDAG-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-SDAG-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict: Index: llvm/test/CodeGen/AMDGPU/uaddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -557,8 +557,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v19 clamp @@ -574,7 +574,7 @@ ; GFX10-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v16 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result Index: llvm/test/CodeGen/AMDGPU/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -629,8 +629,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp @@ -646,15 +646,15 @@ ; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v16 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_usubsat_v16i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp +; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp ; GFX11-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp ; GFX11-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp @@ -670,7 +670,7 @@ ; GFX11-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp ; GFX11-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v15, v15, v16 clamp ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result Index: llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1644,15 +1644,14 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: global_load_dword v7, v[3:4], off -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -161,63 +161,65 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_func, ptr %extern_func2) #0 { ; SI-LABEL: loop: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: v_mov_b32_e32 v0, v1 -; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6 +; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: s_mov_b32 s15, 0x31c16000 ; SI-NEXT: s_add_u32 s12, s12, s1 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo ; SI-NEXT: s_xor_b32 s6, exec_lo, s0 -; SI-NEXT: s_cbranch_execz .LBB3_4 -; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b32 s6, s6 +; SI-NEXT: s_cbranch_execnz .LBB3_6 +; SI-NEXT: .LBB3_2: ; %end +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; SI-NEXT: s_branch .LBB3_9 +; SI-NEXT: .LBB3_3: ; %else ; SI-NEXT: s_mov_b32 s7, exec_lo -; SI-NEXT: .LBB3_2: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: .LBB3_4: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_readfirstlane_b32 s4, v4 ; SI-NEXT: v_readfirstlane_b32 s5, v5 ; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo +; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: s_mov_b64 s[0:1], s[12:13] ; SI-NEXT: s_mov_b64 s[2:3], s[14:15] ; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 -; SI-NEXT: s_cbranch_execnz .LBB3_2 -; SI-NEXT: ; %bb.3: +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 exec_lo, s7 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: .LBB3_4: ; %Flow ; SI-NEXT: s_andn2_saveexec_b32 s6, s6 -; SI-NEXT: s_cbranch_execz .LBB3_8 -; SI-NEXT: ; %bb.5: ; %if +; SI-NEXT: s_cbranch_execz .LBB3_2 +; SI-NEXT: .LBB3_6: ; %if ; SI-NEXT: s_mov_b32 s7, exec_lo -; SI-NEXT: .LBB3_6: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: .LBB3_7: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: v_readfirstlane_b32 s5, v3 ; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3] ; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo +; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: s_mov_b64 s[0:1], s[12:13] ; SI-NEXT: s_mov_b64 s[2:3], s[14:15] ; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 -; SI-NEXT: s_cbranch_execnz .LBB3_6 -; SI-NEXT: ; %bb.7: +; SI-NEXT: s_cbranch_execnz .LBB3_7 +; SI-NEXT: ; %bb.8: ; SI-NEXT: s_mov_b32 exec_lo, s7 -; SI-NEXT: .LBB3_8: ; %end ; SI-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; SI-NEXT: v_mov_b32_e32 v0, v1 -; SI-NEXT: ; return to shader part epilog +; SI-NEXT: s_branch .LBB3_9 +; SI-NEXT: .LBB3_9: main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else