diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5689,12 +5689,13 @@ return getConstant(0, DL, VT); // Skip unnecessary zext_inreg pattern: - // (zext (trunc (assertzext x))) -> (assertzext x) + // (zext (trunc x)) -> x iff the upper bits are known zero. // TODO: Generalize to MaskedValueIsZero check? if (OpOpcode == ISD::TRUNCATE) { SDValue OpOp = N1.getOperand(0); if (OpOp.getValueType() == VT) { - if (OpOp.getOpcode() == ISD::AssertZext && N1->hasOneUse()) { + if ((OpOp.getOpcode() == ISD::AssertZext && N1->hasOneUse()) || + OpOp.getOpcode() == ISD::SRL) { APInt HiBits = APInt::getBitsSetFrom(VT.getScalarSizeInBits(), N1.getScalarValueSizeInBits()); if (MaskedValueIsZero(OpOp, HiBits)) { diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -1523,9 +1523,8 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s4, 16 -; VI-NEXT: v_cmp_ne_u16_e64 s[6:7], s5, 0 -; VI-NEXT: s_and_b64 vcc, exec, s[6:7] -; VI-NEXT: s_cbranch_vccz .LBB14_4 +; VI-NEXT: s_cmp_lg_u32 s5, 0 +; VI-NEXT: s_cbranch_scc0 .LBB14_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -8,7 +8,10 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6060706 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v1 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX10-NEXT: global_store_dword v[4:5], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -16,9 +19,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0x6060706 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -327,7 +332,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x4070706 +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v0 +; GFX10-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -335,9 +342,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x4070706 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -353,7 +361,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040706 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 ; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -361,9 +369,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x5040706 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16 ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -406,7 +413,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -414,9 +422,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060706 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -458,7 +466,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -466,9 +475,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060706 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1504,70 +1513,74 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX10-NEXT: v_bfe_i32 v1, v4, 0, 8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 24, v9 -; GFX10-NEXT: v_bfe_i32 v3, v4, 8, 8 -; GFX10-NEXT: v_bfe_i32 v1, v9, 16, 8 -; GFX10-NEXT: v_bfe_i32 v10, v4, 16, 8 -; GFX10-NEXT: v_cvt_f32_i32_e32 v13, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 24, v4 -; GFX10-NEXT: v_xor_b32_e32 v15, v2, v3 -; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX10-NEXT: v_xor_b32_e32 v12, v1, v0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v13 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 24, v9 +; GFX10-NEXT: v_bfe_i32 v10, v4, 8, 8 +; GFX10-NEXT: v_bfe_i32 v2, v9, 16, 8 +; GFX10-NEXT: v_bfe_i32 v11, v4, 16, 8 ; GFX10-NEXT: v_cvt_f32_i32_e32 v14, v1 -; GFX10-NEXT: v_xor_b32_e32 v1, v1, v10 +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 24, v4 +; GFX10-NEXT: v_xor_b32_e32 v16, v3, v10 ; GFX10-NEXT: v_cvt_f32_i32_e32 v10, v10 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v3 -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v11 +; GFX10-NEXT: v_xor_b32_e32 v13, v2, v1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v14 +; GFX10-NEXT: v_cvt_f32_i32_e32 v15, v2 +; GFX10-NEXT: v_xor_b32_e32 v2, v2, v11 ; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v11 -; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v10 -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 30, v12 -; GFX10-NEXT: v_mul_f32_e32 v16, v14, v16 +; GFX10-NEXT: v_xor_b32_e32 v1, v1, v12 +; GFX10-NEXT: v_cvt_f32_i32_e32 v12, v12 +; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v11 -; GFX10-NEXT: v_ashrrev_i32_e32 v15, 30, v15 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1 -; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17 -; GFX10-NEXT: v_trunc_f32_e32 v16, v16 -; GFX10-NEXT: v_or_b32_e32 v12, 1, v12 -; GFX10-NEXT: v_or_b32_e32 v15, 1, v15 -; GFX10-NEXT: v_mul_f32_e32 v18, v14, v18 +; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13 +; GFX10-NEXT: v_mul_f32_e32 v17, v15, v17 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v12 +; GFX10-NEXT: v_ashrrev_i32_e32 v16, 30, v16 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 30, v2 +; GFX10-NEXT: v_mul_f32_e32 v18, v3, v18 ; GFX10-NEXT: v_trunc_f32_e32 v17, v17 -; GFX10-NEXT: v_mad_f32 v20, -v16, v13, v14 -; GFX10-NEXT: v_mul_f32_e32 v19, v13, v19 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX10-NEXT: v_or_b32_e32 v13, 1, v13 +; GFX10-NEXT: v_or_b32_e32 v16, 1, v16 +; GFX10-NEXT: v_mul_f32_e32 v19, v15, v19 ; GFX10-NEXT: v_trunc_f32_e32 v18, v18 -; GFX10-NEXT: v_mad_f32 v2, -v17, v3, v2 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v13| +; GFX10-NEXT: v_mad_f32 v21, -v17, v14, v15 +; GFX10-NEXT: v_mul_f32_e32 v20, v14, v20 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1 ; GFX10-NEXT: v_trunc_f32_e32 v19, v19 +; GFX10-NEXT: v_mad_f32 v3, -v18, v10, v3 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14| +; GFX10-NEXT: v_trunc_f32_e32 v20, v20 +; GFX10-NEXT: v_or_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_mad_f32 v15, -v19, v11, v15 ; GFX10-NEXT: v_or_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_mad_f32 v14, -v18, v10, v14 -; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v3| -; GFX10-NEXT: v_mad_f32 v21, -v19, v11, v13 -; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v3|, |v10| +; GFX10-NEXT: v_mad_f32 v22, -v20, v12, v14 ; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 ; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v15, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v14|, |v10| ; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v16, v12 -; GFX10-NEXT: v_add_nc_u32_sdwa v2, v17, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v16, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, |v11| +; GFX10-NEXT: v_cvt_i32_f32_e32 v20, v20 +; GFX10-NEXT: v_add_nc_u32_e32 v10, v17, v13 +; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v12| +; GFX10-NEXT: v_lshlrev_b16 v4, 8, v4 +; GFX10-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u32_e32 v2, v19, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v11| -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_add_nc_u32_e32 v1, v18, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo -; GFX10-NEXT: v_add_nc_u32_sdwa v0, v19, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706 -; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dword v[5:6], v0, off -; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_add_nc_u32_sdwa v1, v20, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v[5:6], v1, off +; GFX10-NEXT: global_store_dword v[7:8], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: sdiv_store_div: @@ -1579,71 +1592,74 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: global_load_dword v9, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x60706 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_i32 v1, v4, 0, 8 +; GFX9-NEXT: v_ashrrev_i32_e32 v10, 24, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 -; GFX9-NEXT: v_bfe_i32 v2, v9, 16, 8 -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 24, v9 -; GFX9-NEXT: v_bfe_i32 v9, v4, 8, 8 -; GFX9-NEXT: v_cvt_f32_i32_e32 v12, v1 -; GFX9-NEXT: v_bfe_i32 v10, v4, 16, 8 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 24, v4 -; GFX9-NEXT: v_xor_b32_e32 v14, v3, v9 -; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v9 -; GFX9-NEXT: v_xor_b32_e32 v11, v2, v1 +; GFX9-NEXT: v_bfe_i32 v2, v9, 0, 8 +; GFX9-NEXT: v_bfe_i32 v11, v9, 8, 8 ; GFX9-NEXT: v_cvt_f32_i32_e32 v13, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v10 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v9 +; GFX9-NEXT: v_bfe_i32 v3, v4, 16, 8 +; GFX9-NEXT: v_bfe_i32 v12, v9, 16, 8 +; GFX9-NEXT: v_ashrrev_i32_e32 v9, 24, v9 +; GFX9-NEXT: v_xor_b32_e32 v15, v10, v11 +; GFX9-NEXT: v_cvt_f32_i32_e32 v11, v11 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, v3, v2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v14, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, v3, v12 +; GFX9-NEXT: v_cvt_f32_i32_e32 v12, v12 +; GFX9-NEXT: v_xor_b32_e32 v2, v2, v9 +; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v9 +; GFX9-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 30, v15 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v13 ; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v10 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v12 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v9 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v10 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4 -; GFX9-NEXT: v_mul_f32_e32 v15, v13, v15 -; GFX9-NEXT: v_mul_f32_e32 v16, v3, v16 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v11 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v12 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v9 +; GFX9-NEXT: v_mul_f32_e32 v15, v14, v15 +; GFX9-NEXT: v_mul_f32_e32 v16, v10, v16 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15 -; GFX9-NEXT: v_ashrrev_i32_e32 v11, 30, v11 -; GFX9-NEXT: v_mul_f32_e32 v17, v13, v17 -; GFX9-NEXT: v_mul_f32_e32 v18, v12, v18 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX9-NEXT: v_mul_f32_e32 v17, v14, v17 +; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18 ; GFX9-NEXT: v_trunc_f32_e32 v16, v16 -; GFX9-NEXT: v_mad_f32 v19, -v15, v12, v13 -; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14 -; GFX9-NEXT: v_or_b32_e32 v11, 1, v11 +; GFX9-NEXT: v_mad_f32 v19, -v15, v13, v14 +; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_trunc_f32_e32 v17, v17 ; GFX9-NEXT: v_trunc_f32_e32 v18, v18 -; GFX9-NEXT: v_mad_f32 v3, -v16, v9, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v12| -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 -; GFX9-NEXT: v_or_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_mad_f32 v10, -v16, v11, v10 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v13| +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 30, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v15, v15 ; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16 -; GFX9-NEXT: v_mad_f32 v13, -v17, v10, v13 +; GFX9-NEXT: v_mad_f32 v14, -v17, v12, v14 ; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17 -; GFX9-NEXT: v_mad_f32 v20, -v18, v4, v12 +; GFX9-NEXT: v_mad_f32 v20, -v18, v9, v13 ; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v9| -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v10|, |v11| +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v14|, |v12| ; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v14, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v10| -; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, |v9| ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, |v4| -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v4, v15, v11 -; GFX9-NEXT: v_add_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v2, v17, v2 -; GFX9-NEXT: v_add_u32_sdwa v1, v18, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v1, v15, v1 +; GFX9-NEXT: v_add_u32_sdwa v4, v16, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v3, v17, v3 +; GFX9-NEXT: v_add_u32_sdwa v2, v18, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v1, off ; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0)