diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2402,24 +2402,6 @@ return getConstant(NewVal, SDLoc(V), V.getValueType()); break; } - case ISD::SRL: - // Only look at single-use SRLs. - if (!V.getNode()->hasOneUse()) - break; - if (auto *RHSC = dyn_cast(V.getOperand(1))) { - // See if we can recursively simplify the LHS. - unsigned Amt = RHSC->getZExtValue(); - - // Watch out for shift count overflow though. - if (Amt >= DemandedBits.getBitWidth()) - break; - APInt SrcDemandedBits = DemandedBits << Amt; - if (SDValue SimplifyLHS = - GetDemandedBits(V.getOperand(0), SrcDemandedBits)) - return getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS, - V.getOperand(1)); - } - break; } return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1517,6 +1517,16 @@ // low bits known zero. Known.Zero.setLowBits(ShAmt); + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!InDemandedMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0) { + SDValue NewOp = TLO.DAG.getNode(ISD::SHL, dl, VT, DemandedOp0, Op1); + return TLO.CombineTo(Op, NewOp); + } + } + // Try shrinking the operation as long as the shift amount will still be // in range. if ((ShAmt < DemandedBits.getActiveBits()) && @@ -1586,6 +1596,16 @@ Known.One.lshrInPlace(ShAmt); // High bits known zero. Known.Zero.setHighBits(ShAmt); + + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!InDemandedMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0) { + SDValue NewOp = TLO.DAG.getNode(ISD::SRL, dl, VT, DemandedOp0, Op1); + return TLO.CombineTo(Op, NewOp); + } + } } break; } diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll --- a/llvm/test/CodeGen/AArch64/parity.ll +++ b/llvm/test/CodeGen/AArch64/parity.ll @@ -47,8 +47,8 @@ ; CHECK-LABEL: parity_17: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0x1ffff -; CHECK-NEXT: eor w8, w8, w8, lsr #16 -; CHECK-NEXT: eor w8, w8, w8, lsr #8 +; CHECK-NEXT: eor w9, w8, w8, lsr #16 +; CHECK-NEXT: eor w8, w9, w8, lsr #8 ; CHECK-NEXT: eor w8, w8, w8, lsr #4 ; CHECK-NEXT: eor w8, w8, w8, lsr #2 ; CHECK-NEXT: eor w8, w8, w8, lsr #1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -5239,77 +5239,73 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s10, s2, 0xf0000 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; GFX6-NEXT: s_bfe_i32 s1, s0, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s10 +; GFX6-NEXT: s_xor_b32 s1, s10, s1 +; GFX6-NEXT: s_ashr_i32 s1, s1, 30 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: s_or_b32 s1, s1, 1 +; GFX6-NEXT: v_mov_b32_e32 v7, s1 +; GFX6-NEXT: s_lshr_b32 s9, s0, 15 +; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX6-NEXT: v_trunc_f32_e32 v6, v6 +; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 +; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc +; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: s_movk_i32 s3, 0x7fff -; GFX6-NEXT: s_and_b32 s11, s0, s3 -; GFX6-NEXT: s_bfe_i32 s11, s11, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s11 -; GFX6-NEXT: s_and_b32 s9, s2, s3 -; GFX6-NEXT: s_bfe_i32 s9, s9, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s9, s9, s11 -; GFX6-NEXT: s_ashr_i32 s9, s9, 30 -; GFX6-NEXT: s_or_b32 s9, s9, 1 -; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, s9 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_bfe_u32 s12, s0, 0xf000f -; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 -; GFX6-NEXT: s_lshr_b32 s1, s0, 15 -; GFX6-NEXT: s_bfe_i32 s0, s12, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 -; GFX6-NEXT: s_lshr_b32 s8, s2, 15 -; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX6-NEXT: s_xor_b32 s0, s2, s0 +; GFX6-NEXT: s_xor_b32 s0, s1, s0 +; GFX6-NEXT: v_and_b32_e32 v3, s3, v2 +; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 +; GFX6-NEXT: v_trunc_f32_e32 v7, v7 +; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s2, v4 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 ; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| +; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 ; GFX6-NEXT: s_or_b32 s0, s0, 1 -; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| -; GFX6-NEXT: v_mov_b32_e32 v6, s0 -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc -; GFX6-NEXT: v_bfe_i32 v4, v1, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v4 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX6-NEXT: v_bfe_i32 v6, v0, 0, 15 -; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v5 -; GFX6-NEXT: v_xor_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 30, v4 -; GFX6-NEXT: v_or_b32_e32 v4, 1, v4 -; GFX6-NEXT: v_mul_f32_e32 v6, v7, v8 -; GFX6-NEXT: v_trunc_f32_e32 v6, v6 -; GFX6-NEXT: v_mad_f32 v7, -v6, v5, v7 -; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v5| -; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_mul_lo_u32 v1, v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, s0 +; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc +; GFX6-NEXT: v_and_b32_e32 v1, s3, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 +; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| +; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 +; GFX6-NEXT: s_lshr_b32 s8, s2, 15 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s8, v5 ; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v3, s3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5323,82 +5319,78 @@ ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: s_movk_i32 s8, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, s8 -; GFX9-NEXT: s_and_b32 s1, s6, s8 -; GFX9-NEXT: s_bfe_i32 s1, s1, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 -; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 ; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 -; GFX9-NEXT: s_or_b32 s11, s0, 1 -; GFX9-NEXT: s_lshr_b32 s9, s4, 15 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0xf000f +; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 +; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 +; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 +; GFX9-NEXT: s_or_b32 s9, s0, 1 +; GFX9-NEXT: s_lshr_b32 s5, s4, 15 ; GFX9-NEXT: s_lshr_b32 s7, s6, 15 -; GFX9-NEXT: s_bfe_u32 s10, s6, 0xf000f -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cselect_b32 s0, s11, 0 -; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 -; GFX9-NEXT: s_bfe_i32 s0, s10, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: s_bfe_i32 s1, s5, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX9-NEXT: s_cselect_b32 s0, s9, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 +; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX9-NEXT: v_and_b32_e32 v3, s8, v1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s5, s0, 1 -; GFX9-NEXT: v_and_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 +; GFX9-NEXT: v_trunc_f32_e32 v7, v7 +; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 +; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 +; GFX9-NEXT: s_or_b32 s6, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cselect_b32 s0, s5, 0 -; GFX9-NEXT: v_bfe_i32 v4, v1, 0, 15 -; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, v4 +; GFX9-NEXT: s_cselect_b32 s0, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v5, s0, v7 +; GFX9-NEXT: v_bfe_i32 v7, v0, 0, 15 +; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v6 +; GFX9-NEXT: v_xor_b32_e32 v1, v7, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_mul_f32_e32 v7, v8, v9 +; GFX9-NEXT: v_trunc_f32_e32 v7, v7 +; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v7 +; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 +; GFX9-NEXT: v_mul_lo_u32 v5, v5, s7 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v4 ; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_bfe_i32 v6, v0, 0, 15 -; GFX9-NEXT: v_cvt_f32_i32_e32 v7, v6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, v6, v4 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 30, v4 -; GFX9-NEXT: v_or_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_mul_f32_e32 v6, v7, v8 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v6 -; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v7 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| -; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 -; GFX9-NEXT: v_add_u32_e32 v4, v8, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s6 -; GFX9-NEXT: v_mul_lo_u32 v1, v4, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 -; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 -; GFX9-NEXT: v_sub_u32_e32 v2, s4, v2 +; GFX9-NEXT: v_sub_u32_e32 v4, s5, v5 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v4, s8, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v2, s8, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX9-NEXT: global_store_dword v4, v0, s[2:3] +; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v4, v0, s[2:3] offset:4 +; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 ; GFX9-NEXT: s_endpgm %r = srem <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -463,10 +463,10 @@ ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v2i16: @@ -531,12 +531,12 @@ ; SI-NEXT: v_bfi_b32 v2, s4, v2, v7 ; SI-NEXT: v_and_b32_e32 v4, s5, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, s5, v3 +; SI-NEXT: v_and_b32_e32 v5, s5, v3 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -823,18 +823,16 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2 +; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1077,39 +1075,39 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v10, v[4:5] -; VI-NEXT: flat_load_ubyte v11, v[6:7] -; VI-NEXT: flat_load_ubyte v8, v[8:9] +; VI-NEXT: flat_load_ubyte v12, v[4:5] ; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 2, v0 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v8, v[8:9] +; VI-NEXT: flat_load_ubyte v9, v[10:11] ; VI-NEXT: flat_load_ubyte v6, v[6:7] ; VI-NEXT: flat_load_ubyte v4, v[4:5] -; VI-NEXT: flat_load_ubyte v2, v[2:3] +; VI-NEXT: flat_load_ubyte v7, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v12 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v3, v9 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v11 -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1118,32 +1116,31 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v7, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_short_d16 v4, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, v1 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v3, v1 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v5 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 -; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 -; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 -; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid @@ -1416,18 +1413,16 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2 +; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1443,24 +1438,24 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v5, v[6:7] -; VI-NEXT: flat_load_ubyte v2, v[2:3] +; VI-NEXT: flat_load_ubyte v6, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -1469,23 +1464,22 @@ ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 ; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 8, v2 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -163,18 +163,18 @@ ; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v6 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v7 offset:5 ; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v4 offset:2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) ; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v5 offset:3 ; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v2 ; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v3 offset:1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v8 offset:6 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v0 offset:7 -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v6 offset:4 -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v7 offset:5 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds8align1: @@ -235,12 +235,12 @@ ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(2) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:4 ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2 offset:2 ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(2) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:6 -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:4 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds8align2: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -687,14 +687,15 @@ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_or_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5 -; SI-NEXT: v_or_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, s4, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_and_b32_e32 v1, s4, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v2i16: @@ -887,18 +888,18 @@ ; SI-NEXT: v_or_b32_e32 v4, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4 -; SI-NEXT: v_or_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_mov_b32 s4, 0xffff -; SI-NEXT: v_alignbit_b32 v2, v2, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v2, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_and_b32_e32 v2, s4, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, s4, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, s4, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -962,37 +962,34 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v1, v2, 8, 8 -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 +; GFX7-NEXT: v_bfe_i32 v5, v2, 0, 8 +; GFX7-NEXT: v_ashrrev_i32_e32 v6, 24, v2 +; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8 -; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v6 ; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 +; GFX7-NEXT: v_ashrrev_i32_e32 v9, 24, v0 +; GFX7-NEXT: v_bfe_i32 v8, v0, 0, 8 +; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v9, v3, v9 +; GFX7-NEXT: v_and_b32_e32 v3, v3, v8 +; GFX7-NEXT: v_alignbit_b32 v2, 0, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v0, 0, v0, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v8 -; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v3, v1 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v6, v1 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v6, v9, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1864,32 +1864,28 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b32 s4, 0xff00 ; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v3, s5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v6, s5, v0 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s5, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v8 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s5, v2 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v6, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v6, 0, v6, 16 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2163,18 +2159,16 @@ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v8, v4, v5 -; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; GFX9-NODL-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v7, v4, v5 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v7 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 +; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8 ; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; @@ -2193,18 +2187,16 @@ ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v4, v5 -; GFX9-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; GFX9-DL-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v4, v5 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v7 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -2223,24 +2215,23 @@ ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-DL-NEXT: v_lshrrev_b16 v8, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-DL-NEXT: v_lshrrev_b16 v9, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX10-DL-NEXT: v_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1 -; GFX10-DL-NEXT: v_mul_lo_u16 v9, v6, v7 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 +; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9 +; GFX10-DL-NEXT: v_mul_lo_u16 v5, v7, v8 ; GFX10-DL-NEXT: v_lshlrev_b16 v4, 8, v4 -; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v8 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshlrev_b16 v5, 8, v5 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6 +; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4 -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5 -; GFX10-DL-NEXT: v_mad_u16 v1, v6, v7, v1 +; GFX10-DL-NEXT: v_mad_u16 v1, v7, v8, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2204,66 +2204,54 @@ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v1, v3, 20, 4 -; GFX7-NEXT: v_bfe_i32 v4, v3, 16, 4 -; GFX7-NEXT: v_bfe_i32 v5, v3, 4, 4 -; GFX7-NEXT: v_bfe_i32 v6, v3, 0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_i32 v10, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 4, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; GFX7-NEXT: v_and_b32_e32 v11, v2, v13 -; GFX7-NEXT: v_bfe_i32 v7, v3, 24, 4 -; GFX7-NEXT: v_bfe_i32 v8, v3, 8, 4 +; GFX7-NEXT: v_bfe_i32 v1, v3, 24, 4 +; GFX7-NEXT: v_bfe_i32 v4, v3, 20, 4 +; GFX7-NEXT: v_bfe_i32 v5, v3, 8, 4 +; GFX7-NEXT: v_bfe_i32 v6, v3, 4, 4 +; GFX7-NEXT: v_bfe_i32 v7, v3, 16, 4 +; GFX7-NEXT: v_bfe_i32 v8, v3, 0, 4 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v3 ; GFX7-NEXT: v_bfe_i32 v3, v3, 12, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 24, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 4, 4 +; GFX7-NEXT: v_bfe_i32 v15, v0, 0, 4 ; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0 +; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 ; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX7-NEXT: v_or_b32_e32 v6, v11, v10 -; GFX7-NEXT: v_and_b32_e32 v12, v2, v14 -; GFX7-NEXT: v_and_b32_e32 v13, v2, v15 -; GFX7-NEXT: v_and_b32_e32 v14, v2, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_and_b32_e32 v9, v2, v9 +; GFX7-NEXT: v_and_b32_e32 v8, v2, v8 +; GFX7-NEXT: v_and_b32_e32 v7, v2, v7 ; GFX7-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_and_b32_e32 v4, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v5 -; GFX7-NEXT: buffer_load_ushort v5, off, s[0:3], 0 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v16, v2, v16 +; GFX7-NEXT: v_and_b32_e32 v15, v2, v15 +; GFX7-NEXT: v_and_b32_e32 v2, v2, v14 +; GFX7-NEXT: buffer_load_ushort v14, off, s[0:3], 0 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v13, s4, v13 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v4, v6, v5 -; GFX7-NEXT: v_mad_u32_u24 v4, v16, v11, v4 -; GFX7-NEXT: v_mad_u32_u24 v4, v8, v13, v4 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v4 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v15, v10, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v14, v0 +; GFX7-NEXT: v_mad_u32_u24 v8, v8, v15, v14 +; GFX7-NEXT: v_mad_u32_u24 v6, v6, v13, v8 +; GFX7-NEXT: v_mad_u32_u24 v5, v5, v12, v6 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v5 +; GFX7-NEXT: v_mad_u32_u24 v0, v7, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2813,95 +2801,59 @@ ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff -; GFX7-NEXT: s_mov_b32 s5, 0xffff -; GFX7-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_bfe_i32 v8, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 24, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v6, v4, 20, 4 -; GFX7-NEXT: v_bfe_i32 v7, v4, 16, 4 -; GFX7-NEXT: v_bfe_i32 v8, v4, 12, 4 -; GFX7-NEXT: v_bfe_i32 v9, v4, 8, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_i32 v13, v0, 24, 4 -; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX7-NEXT: v_or_b32_e32 v7, v9, v8 -; GFX7-NEXT: v_and_b32_e32 v9, v2, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v16 -; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 -; GFX7-NEXT: v_bfe_i32 v5, v4, 24, 4 -; GFX7-NEXT: v_bfe_i32 v10, v4, 4, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v4 -; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX7-NEXT: v_ashrrev_i32_e32 v12, 28, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_and_b32_e32 v4, v2, v4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v17, v0, 8, 4 -; GFX7-NEXT: v_bfe_i32 v18, v0, 4, 4 -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v11 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v14 -; GFX7-NEXT: v_and_b32_e32 v11, v2, v15 -; GFX7-NEXT: v_and_b32_e32 v14, v2, v17 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v18 -; GFX7-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v15 -; GFX7-NEXT: v_or_b32_e32 v8, v9, v8 -; GFX7-NEXT: v_or_b32_e32 v9, v11, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v6, s5, v6 -; GFX7-NEXT: v_or_b32_e32 v10, v14, v13 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX7-NEXT: v_bfe_i32 v15, v0, 0, 4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v2 +; GFX7-NEXT: v_bfe_i32 v10, v2, 12, 4 +; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 24, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4 +; GFX7-NEXT: v_bfe_i32 v17, v0, 12, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0 +; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v15, v3, v15 +; GFX7-NEXT: v_and_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_and_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v4, s5, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v7 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v9 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v6 -; GFX7-NEXT: v_and_b32_e32 v7, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v13, v2, v0 -; GFX7-NEXT: v_bfe_u32 v8, v4, 8, 8 -; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_and_b32_e32 v12, v2, v12 -; GFX7-NEXT: v_and_b32_e32 v9, v2, v5 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_bfe_u32 v10, v5, 8, 8 -; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v7, v7, v13, v16 -; GFX7-NEXT: v_mad_u32_u24 v7, v8, v14, v7 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v0, v7 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v10, v15, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v12, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v14, v3, v14 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; GFX7-NEXT: v_and_b32_e32 v9, v3, v9 +; GFX7-NEXT: v_and_b32_e32 v13, v3, v13 +; GFX7-NEXT: v_and_b32_e32 v3, v3, v16 +; GFX7-NEXT: v_alignbit_b32 v10, 0, v10, 24 +; GFX7-NEXT: v_alignbit_b32 v16, 0, v17, 24 +; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_mad_u32_u24 v0, v10, v16, v0 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 +; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v3, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -3028,6 +2980,8 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v2 @@ -3038,10 +2992,11 @@ ; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v2 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 @@ -3052,49 +3007,45 @@ ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 12, v10 -; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13 +; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0 +; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX9-NEXT: v_mul_lo_u16_e32 v13, v16, v18 ; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-NEXT: v_mul_lo_u16_e32 v13, v16, v18 +; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17 ; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14 +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v1, v7, v4 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, v7, v4 +; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 ; GFX9-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v5 ; GFX9-NEXT: v_mad_legacy_u16 v0, v15, v17, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v9 +; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 ; GFX9-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -3121,6 +3072,8 @@ ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v2 @@ -3131,10 +3084,11 @@ ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v2 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v9 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 @@ -3145,49 +3099,45 @@ ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v0 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v0, 12, v10 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v13 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v13, v16, v18 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v13, v16, v18 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14 +; GFX9-DL-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v1, v7, v4 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v7, v4 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v15, v17, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v9 +; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 ; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -3214,70 +3164,69 @@ ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v0, 20, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v8, v8, v15 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v17 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v10, v15 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v0, v11 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v0, v11 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v10 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v5, v12 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 8, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v5, v12 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v9 -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v11, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v1, v3 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v11 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v9, v3, v10 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] @@ -3314,78 +3263,77 @@ ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v8, v15 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v3 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v0, v9, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v9, v16 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v9 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v11 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v7, v14 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 8, v10 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v18 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 8, v10 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v12, v5, v11 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v11, v7, v14 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v5, v12 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v2, v1, v2 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v9 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v10 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v9, v8 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v11, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v12, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2108,10 +2108,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 @@ -2119,45 +2119,32 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xf0000 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_u32 v7, v2, 20, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 12, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v4, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX7-NEXT: v_alignbit_b32 v2, v7, v2, 16 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v8 +; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 12, v0 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v8 -; GFX7-NEXT: v_and_b32_e32 v13, 15, v0 -; GFX7-NEXT: v_or_b32_e32 v7, v13, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 15, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v6, v6, v7, v16 -; GFX7-NEXT: v_bfe_u32 v12, v0, 8, 4 -; GFX7-NEXT: v_mad_u32_u24 v6, v8, v13, v6 -; GFX7-NEXT: v_bfe_u32 v14, v0, 20, 4 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v11, v0, 12, 4 -; GFX7-NEXT: v_alignbit_b32 v0, v14, v0, 16 -; GFX7-NEXT: v_mad_u32_u24 v5, v5, v12, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: v_mad_u32_u24 v4, v4, v11, v5 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v4 -; GFX7-NEXT: v_mad_u32_u24 v0, v15, v14, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16 +; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -2477,78 +2464,48 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xf00 -; GFX7-NEXT: v_mov_b32_e32 v3, 0xf00 -; GFX7-NEXT: s_movk_i32 s5, 0xf0f +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf0f ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 28, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 4, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 12, v2 -; GFX7-NEXT: v_bfe_u32 v1, v2, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX7-NEXT: v_alignbit_b32 v2, v6, v2, 24 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v9 +; GFX7-NEXT: v_and_b32_e32 v6, 15, v2 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 4, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_and_b32_e32 v6, v3, v9 -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v3, v3, v11 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX7-NEXT: v_and_b32_e32 v12, 15, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 28, v0 -; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 12, v0 -; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4 -; GFX7-NEXT: v_alignbit_b32 v0, v13, v0, 24 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v15 -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 -; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 15, v1 -; GFX7-NEXT: v_and_b32_e32 v12, 15, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_or_b32_e32 v4, v14, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 4 -; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v13, 15, v0 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v6, v6, v12, v16 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v3 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 4 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 4 -; GFX7-NEXT: v_mad_u32_u24 v6, v7, v13, v6 -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v6 -; GFX7-NEXT: v_and_b32_e32 v8, 15, v2 -; GFX7-NEXT: v_and_b32_e32 v14, 15, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v10, v1 -; GFX7-NEXT: v_bfe_u32 v9, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 4 -; GFX7-NEXT: v_mad_u32_u24 v1, v9, v15, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v11, v0, 8, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 28, v2 +; GFX7-NEXT: v_alignbit_b32 v7, 0, v7, 24 +; GFX7-NEXT: v_alignbit_b32 v14, 0, v14, 24 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 +; GFX7-NEXT: v_bfe_u32 v3, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v9, v2, 16, 4 +; GFX7-NEXT: v_alignbit_b32 v2, v8, v2, 24 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 28, v0 +; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_bfe_u32 v10, v0, 20, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v8, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v15, v0, 24 +; GFX7-NEXT: v_and_b32_e32 v16, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 +; GFX7-NEXT: v_bfe_u32 v13, v16, 8, 8 +; GFX7-NEXT: v_bfe_u32 v5, v6, 8, 8 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v11, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v13, v5, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2642,45 +2599,44 @@ ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_bfe_u32 v0, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 4 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_bfe_u32 v11, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v0, v1, 20, 4 ; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 ; GFX9-NEXT: v_bfe_u32 v6, v1, 24, 4 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 28, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX9-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v11, v2, 16, 4 ; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 ; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX9-NEXT: v_and_b32_e32 v10, 15, v1 ; GFX9-NEXT: v_bfe_u32 v16, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 24, 4 +; GFX9-NEXT: v_and_b32_e32 v10, 15, v1 ; GFX9-NEXT: v_and_b32_e32 v17, 15, v2 +; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v18, v5, v11 ; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4 ; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4 ; GFX9-NEXT: v_mul_lo_u16_e32 v12, v6, v13 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v18, v5, v11 -; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v17 -; GFX9-NEXT: v_or_b32_e32 v7, v12, v7 +; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_or_b32_e32 v1, v18, v0 -; GFX9-NEXT: v_or_b32_e32 v9, v10, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_or_b32_e32 v9, v10, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v1, v9, v4 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, v9, v4 +; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v8 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 ; GFX9-NEXT: v_mad_legacy_u16 v0, v5, v11, v0 @@ -2708,45 +2664,44 @@ ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) +; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 24, 4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v1 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 24, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v2 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v5, v11 ; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 ; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v12, v6, v13 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v5, v11 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v17 -; GFX9-DL-NEXT: v_or_b32_e32 v7, v12, v7 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v12, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v0, v18, v0 ; GFX9-DL-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-DL-NEXT: v_or_b32_e32 v1, v18, v0 -; GFX9-DL-NEXT: v_or_b32_e32 v9, v10, v2 -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v17 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-DL-NEXT: v_or_b32_e32 v9, v10, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v1, v9, v4 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v9, v4 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v5, v11, v0 @@ -2775,55 +2730,54 @@ ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX10-DL-NEXT: v_mul_lo_u16 v9, v9, v10 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v13 ; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 20, 4 +; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v9 ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 24, 4 +; GFX10-DL-NEXT: v_mul_lo_u16 v10, v10, v12 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 ; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v1 -; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9 +; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v8 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 4, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v14 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX10-DL-NEXT: v_mul_lo_u16 v1, v1, v15 -; GFX10-DL-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX10-DL-NEXT: v_mul_lo_u16 v9, v0, v10 -; GFX10-DL-NEXT: v_mul_lo_u16 v10, v6, v13 -; GFX10-DL-NEXT: v_lshlrev_b16 v7, 8, v7 +; GFX10-DL-NEXT: v_mul_lo_u16 v1, v1, v14 +; GFX10-DL-NEXT: v_or_b32_e32 v8, v10, v8 +; GFX10-DL-NEXT: v_mul_lo_u16 v13, v0, v13 +; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v15 ; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v2 ; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-NEXT: v_mul_lo_u16 v11, v5, v12 -; GFX10-DL-NEXT: v_or_b32_e32 v7, v10, v7 -; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9 -; GFX10-DL-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_mul_lo_u16 v10, v5, v9 +; GFX10-DL-NEXT: v_mul_lo_u16 v11, v6, v12 +; GFX10-DL-NEXT: v_lshlrev_b16 v13, 8, v13 +; GFX10-DL-NEXT: v_lshlrev_b16 v7, 8, v7 +; GFX10-DL-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX10-DL-NEXT: v_or_b32_e32 v2, v11, v9 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; GFX10-DL-NEXT: v_or_b32_e32 v2, v10, v13 +; GFX10-DL-NEXT: v_or_b32_sdwa v10, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v14 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v1, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u16 v9, v3, v10 +; GFX10-DL-NEXT: v_or_b32_sdwa v1, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u16 v10, v3, v11 ; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v0, v9, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v10, v8 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-NEXT: v_mad_u16 v0, v5, v12, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v9, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v13, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v6, v12, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1608,10 +1608,10 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_mov_b64 s[2:3], 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_and_b32 s1, s4, s2 +; VI-NEXT: s_lshl_b32 s1, s4, 16 +; VI-NEXT: s_and_b32 s4, s4, s2 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s1, 16 -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_or_b32 s0, s4, s1 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3] @@ -1693,11 +1693,11 @@ ; VI-NEXT: s_mov_b64 s[2:3], 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_lshl_b32 s1, s5, 4 +; VI-NEXT: s_lshl_b32 s5, s4, 16 ; VI-NEXT: s_and_b32 s4, s4, s2 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 -; VI-NEXT: s_lshl_b32 s2, s4, 16 -; VI-NEXT: s_or_b32 s2, s4, s2 +; VI-NEXT: s_or_b32 s2, s4, s5 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -147,10 +147,11 @@ ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 ; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -38,8 +38,9 @@ ; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -85,8 +86,9 @@ ; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -105,13 +107,10 @@ ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -122,13 +121,10 @@ ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm bb: @@ -147,13 +143,10 @@ ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -83,11 +83,11 @@ ; GCN-LABEL: v_shl_i128_vk: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_alignbit_b32 v4, v2, v1, 15 +; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 17 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 15, v1 ; GCN-NEXT: v_alignbit_b32 v1, v1, v0, 15 -; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 15 +; GCN-NEXT: v_or_b32_e32 v2, v2, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 17, v0 -; GCN-NEXT: v_mov_b32_e32 v2, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = shl i128 %lhs, 17 ret i128 %shl @@ -110,11 +110,11 @@ ; GCN-LABEL: v_ashr_i128_vk: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashr_i64 v[4:5], v[2:3], 33 -; GCN-NEXT: v_alignbit_b32 v0, v2, v1, 1 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 1 -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: v_mov_b32_e32 v3, v5 +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_lshl_b64 v[0:1], v[2:3], 31 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], 33 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = ashr i128 %lhs, 33 ret i128 %shl diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -147,10 +147,11 @@ ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 ; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -141,10 +141,11 @@ ; SI-LABEL: trunc_v2i64_arg_to_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v0, s4, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, s4, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: trunc_v2i64_arg_to_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -94,9 +94,8 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 ; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -108,17 +108,17 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_max_u32_e32 v1, v1, v4 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v3 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v2i16: diff --git a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll --- a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -91,18 +91,15 @@ ; BE-LABEL: i56_or: ; BE: @ %bb.0: ; BE-NEXT: mov r1, r0 -; BE-NEXT: ldr r12, [r0] +; BE-NEXT: ldr r0, [r0] ; BE-NEXT: ldrh r2, [r1, #4]! ; BE-NEXT: ldrb r3, [r1, #2] ; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: strb r2, [r1, #2] -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: orr r0, r2, r0, lsl #24 +; BE-NEXT: orr r0, r0, #384 +; BE-NEXT: strb r0, [r1, #2] +; BE-NEXT: lsr r0, r0, #8 +; BE-NEXT: strh r0, [r1] ; BE-NEXT: mov pc, lr %aa = load i56, i56* %a %b = or i56 %aa, 384 @@ -121,20 +118,11 @@ ; ; BE-LABEL: i56_and_or: ; BE: @ %bb.0: -; BE-NEXT: mov r1, r0 +; BE-NEXT: ldrh r1, [r0, #4]! ; BE-NEXT: mov r2, #128 -; BE-NEXT: ldrh r12, [r1, #4]! -; BE-NEXT: ldrb r3, [r1, #2] -; BE-NEXT: strb r2, [r1, #2] -; BE-NEXT: orr r2, r3, r12, lsl #8 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: orr r1, r1, #1 +; BE-NEXT: strb r2, [r0, #2] +; BE-NEXT: strh r1, [r0] ; BE-NEXT: mov pc, lr %b = load i56, i56* %a, align 1 @@ -155,22 +143,13 @@ ; ; BE-LABEL: i56_insert_bit: ; BE: @ %bb.0: -; BE-NEXT: .save {r11, lr} -; BE-NEXT: push {r11, lr} -; BE-NEXT: mov r2, r0 -; BE-NEXT: ldr lr, [r0] -; BE-NEXT: ldrh r12, [r2, #4]! -; BE-NEXT: ldrb r3, [r2, #2] -; BE-NEXT: orr r12, r3, r12, lsl #8 -; BE-NEXT: orr r3, r12, lr, lsl #24 -; BE-NEXT: bic r3, r3, #8192 -; BE-NEXT: orr r1, r3, r1, lsl #13 -; BE-NEXT: lsr r3, r1, #8 -; BE-NEXT: strh r3, [r2] -; BE-NEXT: bic r2, lr, #255 -; BE-NEXT: orr r1, r2, r1, lsr #24 -; BE-NEXT: str r1, [r0] -; BE-NEXT: pop {r11, lr} +; BE-NEXT: ldrh r2, [r0, #4]! +; BE-NEXT: mov r3, #57088 +; BE-NEXT: orr r3, r3, #16711680 +; BE-NEXT: and r2, r3, r2, lsl #8 +; BE-NEXT: orr r1, r2, r1, lsl #13 +; BE-NEXT: lsr r1, r1, #8 +; BE-NEXT: strh r1, [r0] ; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 diff --git a/llvm/test/CodeGen/ARM/parity.ll b/llvm/test/CodeGen/ARM/parity.ll --- a/llvm/test/CodeGen/ARM/parity.ll +++ b/llvm/test/CodeGen/ARM/parity.ll @@ -47,8 +47,8 @@ ; CHECK-LABEL: parity_17: ; CHECK: @ %bb.0: ; CHECK-NEXT: bfc r0, #17, #15 -; CHECK-NEXT: eor r0, r0, r0, lsr #16 -; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r1, r0, r0, lsr #16 +; CHECK-NEXT: eor r0, r1, r0, lsr #8 ; CHECK-NEXT: eor r0, r0, r0, lsr #4 ; CHECK-NEXT: eor r0, r0, r0, lsr #2 ; CHECK-NEXT: eor r0, r0, r0, lsr #1 diff --git a/llvm/test/CodeGen/ARM/ror.ll b/llvm/test/CodeGen/ARM/ror.ll --- a/llvm/test/CodeGen/ARM/ror.ll +++ b/llvm/test/CodeGen/ARM/ror.ll @@ -21,8 +21,14 @@ define <2 x i32> @test2(<2 x i32> %x) nounwind readnone { ; CHECK-LABEL: test2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ror r0, r0, #10 -; CHECK-NEXT: ror r1, r1, #10 +; CHECK-NEXT: bic r2, r0, #15 +; CHECK-NEXT: ror r0, r0, #4 +; CHECK-NEXT: lsr r0, r0, #6 +; CHECK-NEXT: orr r0, r0, r2, lsl #22 +; CHECK-NEXT: bic r2, r1, #15 +; CHECK-NEXT: ror r1, r1, #4 +; CHECK-NEXT: lsr r1, r1, #6 +; CHECK-NEXT: orr r1, r1, r2, lsl #22 ; CHECK-NEXT: bx lr entry: %high_part.i = shl <2 x i32> %x, diff --git a/llvm/test/CodeGen/ARM/uxtb.ll b/llvm/test/CodeGen/ARM/uxtb.ll --- a/llvm/test/CodeGen/ARM/uxtb.ll +++ b/llvm/test/CodeGen/ARM/uxtb.ll @@ -103,11 +103,12 @@ ; CHECK-LABEL: test10: ; CHECK: @ %bb.0: ; CHECK-NEXT: mov r1, #248 +; CHECK-NEXT: mov r2, #7 ; CHECK-NEXT: orr r1, r1, #16252928 -; CHECK-NEXT: and r0, r1, r0, lsr #7 -; CHECK-NEXT: lsr r1, r0, #5 -; CHECK-NEXT: uxtb16 r1, r1 -; CHECK-NEXT: orr r0, r1, r0 +; CHECK-NEXT: orr r2, r2, #458752 +; CHECK-NEXT: and r1, r1, r0, lsr #7 +; CHECK-NEXT: and r0, r2, r0, lsr #12 +; CHECK-NEXT: orr r0, r0, r1 ; CHECK-NEXT: bx lr %tmp1 = lshr i32 %p0, 7 %tmp2 = and i32 %tmp1, 16253176 diff --git a/llvm/test/CodeGen/Mips/funnel-shift.ll b/llvm/test/CodeGen/Mips/funnel-shift.ll --- a/llvm/test/CodeGen/Mips/funnel-shift.ll +++ b/llvm/test/CodeGen/Mips/funnel-shift.ll @@ -80,8 +80,8 @@ ; CHECK-BE-NEXT: andi $5, $1, 63 ; CHECK-BE-NEXT: srl $7, $16, 5 ; CHECK-BE-NEXT: sll $8, $17, 27 -; CHECK-BE-NEXT: or $7, $8, $7 -; CHECK-BE-NEXT: srl $8, $7, 1 +; CHECK-BE-NEXT: or $8, $8, $7 +; CHECK-BE-NEXT: srl $8, $8, 1 ; CHECK-BE-NEXT: srlv $9, $8, $5 ; CHECK-BE-NEXT: andi $1, $1, 32 ; CHECK-BE-NEXT: move $10, $9 @@ -143,8 +143,8 @@ ; CHECK-LE-NEXT: andi $5, $1, 63 ; CHECK-LE-NEXT: srl $7, $17, 5 ; CHECK-LE-NEXT: sll $8, $16, 27 -; CHECK-LE-NEXT: or $7, $8, $7 -; CHECK-LE-NEXT: srl $8, $7, 1 +; CHECK-LE-NEXT: or $8, $8, $7 +; CHECK-LE-NEXT: srl $8, $8, 1 ; CHECK-LE-NEXT: srlv $9, $8, $5 ; CHECK-LE-NEXT: andi $1, $1, 32 ; CHECK-LE-NEXT: move $10, $9 diff --git a/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll @@ -170,14 +170,14 @@ ; MIPSEL-NEXT: lui $2, 60010 ; MIPSEL-NEXT: ori $2, $2, 61135 ; MIPSEL-NEXT: sltu $1, $1, $2 -; MIPSEL-NEXT: srl $2, $4, 1 -; MIPSEL-NEXT: andi $3, $3, 3 -; MIPSEL-NEXT: sll $4, $3, 31 -; MIPSEL-NEXT: or $4, $2, $4 +; MIPSEL-NEXT: sll $2, $3, 31 +; MIPSEL-NEXT: srl $4, $4, 1 +; MIPSEL-NEXT: or $4, $4, $2 ; MIPSEL-NEXT: sltiu $2, $4, 13 ; MIPSEL-NEXT: xori $4, $4, 13 ; MIPSEL-NEXT: movz $2, $1, $4 ; MIPSEL-NEXT: sll $1, $5, 1 +; MIPSEL-NEXT: andi $3, $3, 2 ; MIPSEL-NEXT: srl $3, $3, 1 ; MIPSEL-NEXT: or $1, $3, $1 ; MIPSEL-NEXT: andi $1, $1, 3 @@ -205,12 +205,12 @@ ; MIPS64EL-NEXT: daddiu $5, $5, -4401 ; MIPS64EL-NEXT: dsll $4, $4, 1 ; MIPS64EL-NEXT: daddu $3, $3, $4 -; MIPS64EL-NEXT: daddu $2, $3, $2 -; MIPS64EL-NEXT: andi $3, $2, 3 +; MIPS64EL-NEXT: daddu $3, $3, $2 ; MIPS64EL-NEXT: dsll $2, $3, 63 ; MIPS64EL-NEXT: dsrl $4, $1, 1 ; MIPS64EL-NEXT: or $2, $4, $2 ; MIPS64EL-NEXT: sltu $2, $2, $5 +; MIPS64EL-NEXT: andi $3, $3, 2 ; MIPS64EL-NEXT: dsrl $3, $3, 1 ; MIPS64EL-NEXT: dsll $1, $1, 1 ; MIPS64EL-NEXT: or $1, $3, $1 diff --git a/llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll b/llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll --- a/llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll +++ b/llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll @@ -84,35 +84,35 @@ ; PPC64-NEXT: addi 3, 5, 0 ; PPC64-NEXT: .LBB2_2: # %entry ; PPC64-NEXT: sradi 4, 3, 53 -; PPC64-NEXT: clrldi 5, 3, 63 +; PPC64-NEXT: rldicl 5, 3, 63, 1 ; PPC64-NEXT: addi 4, 4, 1 +; PPC64-NEXT: clrldi 6, 3, 63 ; PPC64-NEXT: cmpldi 4, 1 -; PPC64-NEXT: rldicl 4, 3, 63, 1 -; PPC64-NEXT: or 5, 5, 4 -; PPC64-NEXT: rldicl 6, 5, 11, 53 -; PPC64-NEXT: addi 6, 6, 1 -; PPC64-NEXT: clrldi 7, 5, 53 -; PPC64-NEXT: cmpldi 1, 6, 1 -; PPC64-NEXT: clrldi 6, 3, 53 +; PPC64-NEXT: clrldi 4, 3, 53 +; PPC64-NEXT: or 6, 6, 5 +; PPC64-NEXT: clrldi 7, 6, 53 +; PPC64-NEXT: addi 4, 4, 2047 ; PPC64-NEXT: addi 7, 7, 2047 -; PPC64-NEXT: addi 6, 6, 2047 -; PPC64-NEXT: or 4, 7, 4 -; PPC64-NEXT: or 6, 6, 3 -; PPC64-NEXT: rldicl 4, 4, 53, 11 -; PPC64-NEXT: rldicr 6, 6, 0, 52 +; PPC64-NEXT: or 4, 4, 3 +; PPC64-NEXT: or 5, 7, 5 +; PPC64-NEXT: rldicl 7, 3, 10, 54 +; PPC64-NEXT: rldicr 4, 4, 0, 52 +; PPC64-NEXT: addi 7, 7, 1 ; PPC64-NEXT: bc 12, 1, .LBB2_4 ; PPC64-NEXT: # %bb.3: # %entry -; PPC64-NEXT: ori 6, 3, 0 +; PPC64-NEXT: ori 4, 3, 0 ; PPC64-NEXT: b .LBB2_4 ; PPC64-NEXT: .LBB2_4: # %entry -; PPC64-NEXT: rldicl 4, 4, 11, 1 -; PPC64-NEXT: cmpdi 3, 0 -; PPC64-NEXT: std 6, -32(1) -; PPC64-NEXT: bc 12, 5, .LBB2_6 +; PPC64-NEXT: rldicl 5, 5, 53, 11 +; PPC64-NEXT: std 4, -32(1) +; PPC64-NEXT: rldicl 4, 5, 11, 1 +; PPC64-NEXT: cmpldi 7, 1 +; PPC64-NEXT: bc 12, 1, .LBB2_6 ; PPC64-NEXT: # %bb.5: # %entry -; PPC64-NEXT: ori 4, 5, 0 +; PPC64-NEXT: ori 4, 6, 0 ; PPC64-NEXT: b .LBB2_6 ; PPC64-NEXT: .LBB2_6: # %entry +; PPC64-NEXT: cmpdi 3, 0 ; PPC64-NEXT: std 4, -24(1) ; PPC64-NEXT: bc 12, 0, .LBB2_8 ; PPC64-NEXT: # %bb.7: # %entry diff --git a/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll --- a/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll @@ -54,7 +54,7 @@ ; ; RV64I-LABEL: test_bswap_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a1, a0, 8 +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 @@ -1128,7 +1128,7 @@ ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: srli a1, a1, 32 -; RV64I-NEXT: srliw a0, a0, 1 +; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a0, a0, a2 diff --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll --- a/llvm/test/CodeGen/RISCV/rv32zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll @@ -905,23 +905,47 @@ ; RV32I-NEXT: addi a4, a4, 819 ; RV32I-NEXT: and a3, a3, a4 ; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 2 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: srli a2, a0, 2 +; RV32I-NEXT: or a1, a0, a1 +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: srli a2, a1, 2 ; RV32I-NEXT: and a2, a2, a4 -; RV32I-NEXT: or a0, a2, a0 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: gorc2b_i32: ; RV32IB: # %bb.0: +; RV32IB-NEXT: srli a1, a0, 2 +; RV32IB-NEXT: or a1, a1, a0 ; RV32IB-NEXT: orc2.n a0, a0 +; RV32IB-NEXT: slli a1, a1, 2 +; RV32IB-NEXT: lui a2, 838861 +; RV32IB-NEXT: addi a2, a2, -820 +; RV32IB-NEXT: and a1, a1, a2 +; RV32IB-NEXT: srli a2, a0, 2 +; RV32IB-NEXT: lui a3, 209715 +; RV32IB-NEXT: addi a3, a3, 819 +; RV32IB-NEXT: and a2, a2, a3 +; RV32IB-NEXT: or a0, a2, a0 +; RV32IB-NEXT: or a0, a0, a1 ; RV32IB-NEXT: ret ; ; RV32IBP-LABEL: gorc2b_i32: ; RV32IBP: # %bb.0: +; RV32IBP-NEXT: srli a1, a0, 2 +; RV32IBP-NEXT: or a1, a1, a0 ; RV32IBP-NEXT: orc2.n a0, a0 +; RV32IBP-NEXT: slli a1, a1, 2 +; RV32IBP-NEXT: lui a2, 838861 +; RV32IBP-NEXT: addi a2, a2, -820 +; RV32IBP-NEXT: and a1, a1, a2 +; RV32IBP-NEXT: srli a2, a0, 2 +; RV32IBP-NEXT: lui a3, 209715 +; RV32IBP-NEXT: addi a3, a3, 819 +; RV32IBP-NEXT: and a2, a2, a3 +; RV32IBP-NEXT: or a0, a2, a0 +; RV32IBP-NEXT: or a0, a0, a1 ; RV32IBP-NEXT: ret %and1 = shl i32 %a, 2 %shl1 = and i32 %and1, -858993460 @@ -942,46 +966,88 @@ define i64 @gorc2b_i64(i64 %a) nounwind { ; RV32I-LABEL: gorc2b_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: slli a2, a1, 2 -; RV32I-NEXT: slli a3, a0, 2 +; RV32I-NEXT: slli a2, a0, 2 +; RV32I-NEXT: slli a3, a1, 2 ; RV32I-NEXT: lui a4, 838861 ; RV32I-NEXT: addi a4, a4, -820 ; RV32I-NEXT: and a6, a3, a4 ; RV32I-NEXT: and a7, a2, a4 -; RV32I-NEXT: srli a5, a0, 2 -; RV32I-NEXT: srli a3, a1, 2 +; RV32I-NEXT: srli a5, a1, 2 +; RV32I-NEXT: srli a3, a0, 2 ; RV32I-NEXT: lui a2, 209715 ; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: and a3, a3, a2 ; RV32I-NEXT: and a5, a5, a2 -; RV32I-NEXT: or a0, a5, a0 -; RV32I-NEXT: or a1, a3, a1 -; RV32I-NEXT: or a1, a1, a7 -; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: slli a3, a0, 2 -; RV32I-NEXT: slli a5, a1, 2 -; RV32I-NEXT: and a6, a5, a4 -; RV32I-NEXT: and a3, a3, a4 -; RV32I-NEXT: srli a4, a1, 2 -; RV32I-NEXT: srli a5, a0, 2 -; RV32I-NEXT: and a5, a5, a2 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a3, a0, a7 +; RV32I-NEXT: or a5, a1, a6 +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: and a6, a1, a4 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: srli a4, a5, 2 +; RV32I-NEXT: srli a1, a3, 2 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a2, a4, a2 -; RV32I-NEXT: or a1, a2, a1 -; RV32I-NEXT: or a0, a5, a0 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a2, a2, a5 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: or a1, a2, a6 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: gorc2b_i64: ; RV32IB: # %bb.0: -; RV32IB-NEXT: orc2.n a0, a0 +; RV32IB-NEXT: srli a2, a1, 2 +; RV32IB-NEXT: srli a3, a0, 2 +; RV32IB-NEXT: lui a4, 209715 +; RV32IB-NEXT: addi a4, a4, 819 +; RV32IB-NEXT: and a3, a3, a4 +; RV32IB-NEXT: or a3, a3, a0 +; RV32IB-NEXT: or a2, a2, a1 ; RV32IB-NEXT: orc2.n a1, a1 +; RV32IB-NEXT: orc2.n a0, a0 +; RV32IB-NEXT: slli a2, a2, 2 +; RV32IB-NEXT: slli a3, a3, 2 +; RV32IB-NEXT: lui a5, 838861 +; RV32IB-NEXT: addi a5, a5, -820 +; RV32IB-NEXT: and a6, a3, a5 +; RV32IB-NEXT: and a2, a2, a5 +; RV32IB-NEXT: srli a5, a0, 2 +; RV32IB-NEXT: srli a3, a1, 2 +; RV32IB-NEXT: and a3, a3, a4 +; RV32IB-NEXT: and a4, a5, a4 +; RV32IB-NEXT: or a0, a4, a0 +; RV32IB-NEXT: or a1, a3, a1 +; RV32IB-NEXT: or a1, a1, a2 +; RV32IB-NEXT: or a0, a0, a6 ; RV32IB-NEXT: ret ; ; RV32IBP-LABEL: gorc2b_i64: ; RV32IBP: # %bb.0: -; RV32IBP-NEXT: orc2.n a0, a0 +; RV32IBP-NEXT: srli a2, a1, 2 +; RV32IBP-NEXT: srli a3, a0, 2 +; RV32IBP-NEXT: lui a4, 209715 +; RV32IBP-NEXT: addi a4, a4, 819 +; RV32IBP-NEXT: and a3, a3, a4 +; RV32IBP-NEXT: or a3, a3, a0 +; RV32IBP-NEXT: or a2, a2, a1 ; RV32IBP-NEXT: orc2.n a1, a1 +; RV32IBP-NEXT: orc2.n a0, a0 +; RV32IBP-NEXT: slli a2, a2, 2 +; RV32IBP-NEXT: slli a3, a3, 2 +; RV32IBP-NEXT: lui a5, 838861 +; RV32IBP-NEXT: addi a5, a5, -820 +; RV32IBP-NEXT: and a6, a3, a5 +; RV32IBP-NEXT: and a2, a2, a5 +; RV32IBP-NEXT: srli a5, a0, 2 +; RV32IBP-NEXT: srli a3, a1, 2 +; RV32IBP-NEXT: and a3, a3, a4 +; RV32IBP-NEXT: and a4, a5, a4 +; RV32IBP-NEXT: or a0, a4, a0 +; RV32IBP-NEXT: or a1, a3, a1 +; RV32IBP-NEXT: or a1, a1, a2 +; RV32IBP-NEXT: or a0, a0, a6 ; RV32IBP-NEXT: ret %and1 = shl i64 %a, 2 %shl1 = and i64 %and1, -3689348814741910324 @@ -2676,21 +2742,18 @@ define i32 @bswap_rotr_i32(i32 %a) { ; RV32I-LABEL: bswap_rotr_i32: ; RV32I: # %bb.0: -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: lui a2, 16 -; RV32I-NEXT: addi a2, a2, -256 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: lui a2, 4080 ; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: slli a2, a0, 24 +; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: srli a2, a0, 24 -; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: slli a2, a0, 8 -; RV32I-NEXT: lui a3, 4080 -; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: andi a0, a0, -256 ; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: srli a0, a0, 16 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srli a1, a1, 16 +; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: bswap_rotr_i32: @@ -2710,21 +2773,18 @@ define i32 @bswap_rotl_i32(i32 %a) { ; RV32I-LABEL: bswap_rotl_i32: ; RV32I: # %bb.0: -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: lui a2, 16 -; RV32I-NEXT: addi a2, a2, -256 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: srli a2, a0, 24 -; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: srli a2, a0, 8 +; RV32I-NEXT: andi a2, a2, -256 +; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: slli a2, a0, 8 ; RV32I-NEXT: lui a3, 4080 ; RV32I-NEXT: and a2, a2, a3 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: slli a0, a0, 16 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: bswap_rotl_i32: diff --git a/llvm/test/CodeGen/RISCV/rv64zbb-zbp.ll b/llvm/test/CodeGen/RISCV/rv64zbb-zbp.ll --- a/llvm/test/CodeGen/RISCV/rv64zbb-zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb-zbp.ll @@ -574,41 +574,41 @@ ; RV64I-LABEL: roriw_bug: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 31 -; RV64I-NEXT: andi a0, a0, -2 -; RV64I-NEXT: srli a2, a0, 1 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: sext.w a1, a1 -; RV64I-NEXT: xor a0, a0, a1 +; RV64I-NEXT: andi a2, a0, -2 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: xor a0, a2, a0 ; RV64I-NEXT: ret ; ; RV64IB-LABEL: roriw_bug: ; RV64IB: # %bb.0: ; RV64IB-NEXT: slli a1, a0, 31 -; RV64IB-NEXT: andi a0, a0, -2 -; RV64IB-NEXT: srli a2, a0, 1 -; RV64IB-NEXT: or a1, a1, a2 -; RV64IB-NEXT: sext.w a1, a1 -; RV64IB-NEXT: xor a0, a0, a1 +; RV64IB-NEXT: andi a2, a0, -2 +; RV64IB-NEXT: srli a0, a0, 1 +; RV64IB-NEXT: or a0, a1, a0 +; RV64IB-NEXT: sext.w a0, a0 +; RV64IB-NEXT: xor a0, a2, a0 ; RV64IB-NEXT: ret ; ; RV64IBB-LABEL: roriw_bug: ; RV64IBB: # %bb.0: ; RV64IBB-NEXT: slli a1, a0, 31 -; RV64IBB-NEXT: andi a0, a0, -2 -; RV64IBB-NEXT: srli a2, a0, 1 -; RV64IBB-NEXT: or a1, a1, a2 -; RV64IBB-NEXT: sext.w a1, a1 -; RV64IBB-NEXT: xor a0, a0, a1 +; RV64IBB-NEXT: andi a2, a0, -2 +; RV64IBB-NEXT: srli a0, a0, 1 +; RV64IBB-NEXT: or a0, a1, a0 +; RV64IBB-NEXT: sext.w a0, a0 +; RV64IBB-NEXT: xor a0, a2, a0 ; RV64IBB-NEXT: ret ; ; RV64IBP-LABEL: roriw_bug: ; RV64IBP: # %bb.0: ; RV64IBP-NEXT: slli a1, a0, 31 -; RV64IBP-NEXT: andi a0, a0, -2 -; RV64IBP-NEXT: srli a2, a0, 1 -; RV64IBP-NEXT: or a1, a1, a2 -; RV64IBP-NEXT: sext.w a1, a1 -; RV64IBP-NEXT: xor a0, a0, a1 +; RV64IBP-NEXT: andi a2, a0, -2 +; RV64IBP-NEXT: srli a0, a0, 1 +; RV64IBP-NEXT: or a0, a1, a0 +; RV64IBP-NEXT: sext.w a0, a0 +; RV64IBP-NEXT: xor a0, a2, a0 ; RV64IBP-NEXT: ret %a = shl i64 %x, 31 %b = and i64 %x, 18446744073709551614 diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -957,7 +957,7 @@ ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: srli a1, a1, 32 -; RV64I-NEXT: srliw a0, a0, 1 +; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a0, a0, a2 @@ -1506,7 +1506,7 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind { ; RV64I-LABEL: bswap_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a1, a0, 8 +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 @@ -1539,7 +1539,7 @@ define void @bswap_i32_nosext(i32 signext %a, i32* %x) nounwind { ; RV64I-LABEL: bswap_i32_nosext: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: srli a2, a0, 8 ; RV64I-NEXT: lui a3, 16 ; RV64I-NEXT: addiw a3, a3, -256 ; RV64I-NEXT: and a2, a2, a3 diff --git a/llvm/test/CodeGen/RISCV/rv64zbp.ll b/llvm/test/CodeGen/RISCV/rv64zbp.ll --- a/llvm/test/CodeGen/RISCV/rv64zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbp.ll @@ -1000,24 +1000,50 @@ ; RV64I-NEXT: addiw a4, a4, 819 ; RV64I-NEXT: and a3, a3, a4 ; RV64I-NEXT: or a0, a3, a0 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 2 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: srli a2, a0, 2 +; RV64I-NEXT: or a1, a0, a1 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a2, a1, 2 ; RV64I-NEXT: and a2, a2, a4 -; RV64I-NEXT: or a0, a2, a0 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: ret ; ; RV64IB-LABEL: gorc2b_i32: ; RV64IB: # %bb.0: +; RV64IB-NEXT: srliw a1, a0, 2 +; RV64IB-NEXT: or a1, a1, a0 ; RV64IB-NEXT: gorciw a0, a0, 2 +; RV64IB-NEXT: slli a1, a1, 2 +; RV64IB-NEXT: lui a2, 838861 +; RV64IB-NEXT: addiw a2, a2, -820 +; RV64IB-NEXT: and a1, a1, a2 +; RV64IB-NEXT: srli a2, a0, 2 +; RV64IB-NEXT: lui a3, 209715 +; RV64IB-NEXT: addiw a3, a3, 819 +; RV64IB-NEXT: and a2, a2, a3 +; RV64IB-NEXT: or a0, a2, a0 +; RV64IB-NEXT: or a0, a0, a1 +; RV64IB-NEXT: sext.w a0, a0 ; RV64IB-NEXT: ret ; ; RV64IBP-LABEL: gorc2b_i32: ; RV64IBP: # %bb.0: +; RV64IBP-NEXT: srliw a1, a0, 2 +; RV64IBP-NEXT: or a1, a1, a0 ; RV64IBP-NEXT: gorciw a0, a0, 2 +; RV64IBP-NEXT: slli a1, a1, 2 +; RV64IBP-NEXT: lui a2, 838861 +; RV64IBP-NEXT: addiw a2, a2, -820 +; RV64IBP-NEXT: and a1, a1, a2 +; RV64IBP-NEXT: srli a2, a0, 2 +; RV64IBP-NEXT: lui a3, 209715 +; RV64IBP-NEXT: addiw a3, a3, 819 +; RV64IBP-NEXT: and a2, a2, a3 +; RV64IBP-NEXT: or a0, a2, a0 +; RV64IBP-NEXT: or a0, a0, a1 +; RV64IBP-NEXT: sext.w a0, a0 ; RV64IBP-NEXT: ret %and1 = shl i32 %a, 2 %shl1 = and i32 %and1, -858993460 @@ -1059,23 +1085,71 @@ ; RV64I-NEXT: addi a4, a4, 819 ; RV64I-NEXT: and a3, a3, a4 ; RV64I-NEXT: or a0, a3, a0 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 2 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: srli a2, a0, 2 +; RV64I-NEXT: or a1, a0, a1 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a2, a1, 2 ; RV64I-NEXT: and a2, a2, a4 -; RV64I-NEXT: or a0, a2, a0 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: ret ; ; RV64IB-LABEL: gorc2b_i64: ; RV64IB: # %bb.0: +; RV64IB-NEXT: srli a1, a0, 2 +; RV64IB-NEXT: or a1, a1, a0 ; RV64IB-NEXT: orc2.n a0, a0 +; RV64IB-NEXT: slli a1, a1, 2 +; RV64IB-NEXT: lui a2, 1035469 +; RV64IB-NEXT: addiw a2, a2, -819 +; RV64IB-NEXT: slli a2, a2, 12 +; RV64IB-NEXT: addi a2, a2, -819 +; RV64IB-NEXT: slli a2, a2, 12 +; RV64IB-NEXT: addi a2, a2, -819 +; RV64IB-NEXT: slli a2, a2, 12 +; RV64IB-NEXT: addi a2, a2, -820 +; RV64IB-NEXT: and a1, a1, a2 +; RV64IB-NEXT: srli a2, a0, 2 +; RV64IB-NEXT: lui a3, 13107 +; RV64IB-NEXT: addiw a3, a3, 819 +; RV64IB-NEXT: slli a3, a3, 12 +; RV64IB-NEXT: addi a3, a3, 819 +; RV64IB-NEXT: slli a3, a3, 12 +; RV64IB-NEXT: addi a3, a3, 819 +; RV64IB-NEXT: slli a3, a3, 12 +; RV64IB-NEXT: addi a3, a3, 819 +; RV64IB-NEXT: and a2, a2, a3 +; RV64IB-NEXT: or a0, a2, a0 +; RV64IB-NEXT: or a0, a0, a1 ; RV64IB-NEXT: ret ; ; RV64IBP-LABEL: gorc2b_i64: ; RV64IBP: # %bb.0: +; RV64IBP-NEXT: srli a1, a0, 2 +; RV64IBP-NEXT: or a1, a1, a0 ; RV64IBP-NEXT: orc2.n a0, a0 +; RV64IBP-NEXT: slli a1, a1, 2 +; RV64IBP-NEXT: lui a2, 1035469 +; RV64IBP-NEXT: addiw a2, a2, -819 +; RV64IBP-NEXT: slli a2, a2, 12 +; RV64IBP-NEXT: addi a2, a2, -819 +; RV64IBP-NEXT: slli a2, a2, 12 +; RV64IBP-NEXT: addi a2, a2, -819 +; RV64IBP-NEXT: slli a2, a2, 12 +; RV64IBP-NEXT: addi a2, a2, -820 +; RV64IBP-NEXT: and a1, a1, a2 +; RV64IBP-NEXT: srli a2, a0, 2 +; RV64IBP-NEXT: lui a3, 13107 +; RV64IBP-NEXT: addiw a3, a3, 819 +; RV64IBP-NEXT: slli a3, a3, 12 +; RV64IBP-NEXT: addi a3, a3, 819 +; RV64IBP-NEXT: slli a3, a3, 12 +; RV64IBP-NEXT: addi a3, a3, 819 +; RV64IBP-NEXT: slli a3, a3, 12 +; RV64IBP-NEXT: addi a3, a3, 819 +; RV64IBP-NEXT: and a2, a2, a3 +; RV64IBP-NEXT: or a0, a2, a0 +; RV64IBP-NEXT: or a0, a0, a1 ; RV64IBP-NEXT: ret %and1 = shl i64 %a, 2 %shl1 = and i64 %and1, -3689348814741910324 @@ -2693,7 +2767,7 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind { ; RV64I-LABEL: bswap_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a1, a0, 8 +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 @@ -2725,7 +2799,7 @@ define void @bswap_i32_nosext(i32 signext %a, i32* %x) nounwind { ; RV64I-LABEL: bswap_i32_nosext: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: srli a2, a0, 8 ; RV64I-NEXT: lui a3, 16 ; RV64I-NEXT: addiw a3, a3, -256 ; RV64I-NEXT: and a2, a2, a3 @@ -2894,7 +2968,7 @@ define signext i32 @bitreverse_i32(i32 signext %a) nounwind { ; RV64I-LABEL: bitreverse_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a1, a0, 8 +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 @@ -2955,7 +3029,7 @@ define void @bitreverse_i32_nosext(i32 signext %a, i32* %x) nounwind { ; RV64I-LABEL: bitreverse_i32_nosext: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: srli a2, a0, 8 ; RV64I-NEXT: lui a3, 16 ; RV64I-NEXT: addiw a3, a3, -256 ; RV64I-NEXT: and a2, a2, a3 @@ -3130,7 +3204,7 @@ define i32 @bswap_rotr_i32(i32 %a) { ; RV64I-LABEL: bswap_rotr_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a1, a0, 8 +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 @@ -3164,7 +3238,7 @@ define i32 @bswap_rotl_i32(i32 %a) { ; RV64I-LABEL: bswap_rotl_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a1, a0, 8 +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 @@ -3198,7 +3272,7 @@ define i32 @bitreverse_bswap_i32(i32 %a) { ; RV64I-LABEL: bitreverse_bswap_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a1, a0, 8 +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll @@ -360,7 +360,7 @@ ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX2-RV64-NEXT: vle32.v v25, (a0) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 -; LMULMAX2-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX2-RV64-NEXT: srli a2, a1, 8 ; LMULMAX2-RV64-NEXT: lui a3, 16 ; LMULMAX2-RV64-NEXT: addiw a3, a3, -256 ; LMULMAX2-RV64-NEXT: and a2, a2, a3 @@ -376,7 +376,7 @@ ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX2-RV64-NEXT: srli a2, a1, 8 ; LMULMAX2-RV64-NEXT: and a2, a2, a3 ; LMULMAX2-RV64-NEXT: srliw a4, a1, 24 ; LMULMAX2-RV64-NEXT: or a2, a2, a4 @@ -388,7 +388,7 @@ ; LMULMAX2-RV64-NEXT: sw a1, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX2-RV64-NEXT: srli a2, a1, 8 ; LMULMAX2-RV64-NEXT: and a2, a2, a3 ; LMULMAX2-RV64-NEXT: srliw a4, a1, 24 ; LMULMAX2-RV64-NEXT: or a2, a2, a4 @@ -400,7 +400,7 @@ ; LMULMAX2-RV64-NEXT: sw a1, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 -; LMULMAX2-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX2-RV64-NEXT: srli a2, a1, 8 ; LMULMAX2-RV64-NEXT: and a2, a2, a3 ; LMULMAX2-RV64-NEXT: srliw a3, a1, 24 ; LMULMAX2-RV64-NEXT: or a2, a2, a3 @@ -488,7 +488,7 @@ ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 -; LMULMAX1-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX1-RV64-NEXT: srli a2, a1, 8 ; LMULMAX1-RV64-NEXT: lui a3, 16 ; LMULMAX1-RV64-NEXT: addiw a3, a3, -256 ; LMULMAX1-RV64-NEXT: and a2, a2, a3 @@ -504,7 +504,7 @@ ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX1-RV64-NEXT: srli a2, a1, 8 ; LMULMAX1-RV64-NEXT: and a2, a2, a3 ; LMULMAX1-RV64-NEXT: srliw a4, a1, 24 ; LMULMAX1-RV64-NEXT: or a2, a2, a4 @@ -516,7 +516,7 @@ ; LMULMAX1-RV64-NEXT: sw a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX1-RV64-NEXT: srli a2, a1, 8 ; LMULMAX1-RV64-NEXT: and a2, a2, a3 ; LMULMAX1-RV64-NEXT: srliw a4, a1, 24 ; LMULMAX1-RV64-NEXT: or a2, a2, a4 @@ -528,7 +528,7 @@ ; LMULMAX1-RV64-NEXT: sw a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 -; LMULMAX1-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX1-RV64-NEXT: srli a2, a1, 8 ; LMULMAX1-RV64-NEXT: and a2, a2, a3 ; LMULMAX1-RV64-NEXT: srliw a3, a1, 24 ; LMULMAX1-RV64-NEXT: or a2, a2, a3 @@ -1497,7 +1497,7 @@ ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV64-NEXT: vle32.v v26, (a0) ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v26 -; LMULMAX2-RV64-NEXT: srliw a2, a3, 8 +; LMULMAX2-RV64-NEXT: srli a2, a3, 8 ; LMULMAX2-RV64-NEXT: lui a1, 16 ; LMULMAX2-RV64-NEXT: addiw a1, a1, -256 ; LMULMAX2-RV64-NEXT: and a2, a2, a1 @@ -1513,7 +1513,7 @@ ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 -; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: srli a4, a3, 8 ; LMULMAX2-RV64-NEXT: and a4, a4, a1 ; LMULMAX2-RV64-NEXT: srliw a5, a3, 24 ; LMULMAX2-RV64-NEXT: or a4, a4, a5 @@ -1525,7 +1525,7 @@ ; LMULMAX2-RV64-NEXT: sw a3, 60(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 -; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: srli a4, a3, 8 ; LMULMAX2-RV64-NEXT: and a4, a4, a1 ; LMULMAX2-RV64-NEXT: srliw a5, a3, 24 ; LMULMAX2-RV64-NEXT: or a4, a4, a5 @@ -1537,7 +1537,7 @@ ; LMULMAX2-RV64-NEXT: sw a3, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 -; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: srli a4, a3, 8 ; LMULMAX2-RV64-NEXT: and a4, a4, a1 ; LMULMAX2-RV64-NEXT: srliw a5, a3, 24 ; LMULMAX2-RV64-NEXT: or a4, a4, a5 @@ -1549,7 +1549,7 @@ ; LMULMAX2-RV64-NEXT: sw a3, 52(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 -; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: srli a4, a3, 8 ; LMULMAX2-RV64-NEXT: and a4, a4, a1 ; LMULMAX2-RV64-NEXT: srliw a5, a3, 24 ; LMULMAX2-RV64-NEXT: or a4, a4, a5 @@ -1561,7 +1561,7 @@ ; LMULMAX2-RV64-NEXT: sw a3, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 -; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: srli a4, a3, 8 ; LMULMAX2-RV64-NEXT: and a4, a4, a1 ; LMULMAX2-RV64-NEXT: srliw a5, a3, 24 ; LMULMAX2-RV64-NEXT: or a4, a4, a5 @@ -1573,7 +1573,7 @@ ; LMULMAX2-RV64-NEXT: sw a3, 44(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 -; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: srli a4, a3, 8 ; LMULMAX2-RV64-NEXT: and a4, a4, a1 ; LMULMAX2-RV64-NEXT: srliw a5, a3, 24 ; LMULMAX2-RV64-NEXT: or a4, a4, a5 @@ -1585,7 +1585,7 @@ ; LMULMAX2-RV64-NEXT: sw a3, 40(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v26 -; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: srli a4, a3, 8 ; LMULMAX2-RV64-NEXT: and a1, a4, a1 ; LMULMAX2-RV64-NEXT: srliw a4, a3, 24 ; LMULMAX2-RV64-NEXT: or a1, a1, a4 @@ -1730,7 +1730,7 @@ ; LMULMAX1-RV64-NEXT: vle32.v v26, (a6) ; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a4, v26 -; LMULMAX1-RV64-NEXT: srliw a3, a4, 8 +; LMULMAX1-RV64-NEXT: srli a3, a4, 8 ; LMULMAX1-RV64-NEXT: lui a2, 16 ; LMULMAX1-RV64-NEXT: addiw a2, a2, -256 ; LMULMAX1-RV64-NEXT: and a3, a3, a2 @@ -1746,7 +1746,7 @@ ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 -; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: srli a4, a1, 8 ; LMULMAX1-RV64-NEXT: and a4, a4, a2 ; LMULMAX1-RV64-NEXT: srliw a5, a1, 24 ; LMULMAX1-RV64-NEXT: or a4, a4, a5 @@ -1758,7 +1758,7 @@ ; LMULMAX1-RV64-NEXT: sw a1, 44(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 -; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: srli a4, a1, 8 ; LMULMAX1-RV64-NEXT: and a4, a4, a2 ; LMULMAX1-RV64-NEXT: srliw a5, a1, 24 ; LMULMAX1-RV64-NEXT: or a4, a4, a5 @@ -1770,7 +1770,7 @@ ; LMULMAX1-RV64-NEXT: sw a1, 40(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: srli a4, a1, 8 ; LMULMAX1-RV64-NEXT: and a4, a4, a2 ; LMULMAX1-RV64-NEXT: srliw a5, a1, 24 ; LMULMAX1-RV64-NEXT: or a4, a4, a5 @@ -1781,7 +1781,7 @@ ; LMULMAX1-RV64-NEXT: or a1, a1, a4 ; LMULMAX1-RV64-NEXT: sw a1, 36(sp) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 -; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: srli a4, a1, 8 ; LMULMAX1-RV64-NEXT: and a4, a4, a2 ; LMULMAX1-RV64-NEXT: srliw a5, a1, 24 ; LMULMAX1-RV64-NEXT: or a4, a4, a5 @@ -1793,7 +1793,7 @@ ; LMULMAX1-RV64-NEXT: sw a1, 16(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: srli a4, a1, 8 ; LMULMAX1-RV64-NEXT: and a4, a4, a2 ; LMULMAX1-RV64-NEXT: srliw a5, a1, 24 ; LMULMAX1-RV64-NEXT: or a4, a4, a5 @@ -1805,7 +1805,7 @@ ; LMULMAX1-RV64-NEXT: sw a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: srli a4, a1, 8 ; LMULMAX1-RV64-NEXT: and a4, a4, a2 ; LMULMAX1-RV64-NEXT: srliw a5, a1, 24 ; LMULMAX1-RV64-NEXT: or a4, a4, a5 @@ -1817,7 +1817,7 @@ ; LMULMAX1-RV64-NEXT: sw a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 -; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: srli a4, a1, 8 ; LMULMAX1-RV64-NEXT: and a2, a4, a2 ; LMULMAX1-RV64-NEXT: srliw a4, a1, 24 ; LMULMAX1-RV64-NEXT: or a2, a2, a4 diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -321,7 +321,7 @@ ; RV32-NEXT: sw s6, 0(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 ; RV32-NEXT: lw a0, 4(a0) -; RV32-NEXT: lbu a1, 12(s0) +; RV32-NEXT: lb a1, 12(s0) ; RV32-NEXT: lw a2, 8(s0) ; RV32-NEXT: andi a3, a0, 1 ; RV32-NEXT: neg s2, a3 @@ -394,34 +394,30 @@ ; ; RV64-LABEL: test_srem_vec: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s2, 32(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s3, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s4, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s5, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -48 +; RV64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 0(sp) # 8-byte Folded Spill ; RV64-NEXT: mv s0, a0 ; RV64-NEXT: lb a0, 12(a0) ; RV64-NEXT: lwu a1, 8(s0) ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: or a0, a1, a0 -; RV64-NEXT: addi s4, zero, -1 -; RV64-NEXT: srli a1, s4, 24 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: ld a1, 0(s0) -; RV64-NEXT: slli a2, a0, 29 -; RV64-NEXT: srai s1, a2, 31 -; RV64-NEXT: slli a0, a0, 31 -; RV64-NEXT: srli a2, a1, 33 -; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: ld a2, 0(s0) +; RV64-NEXT: slli a0, a0, 29 +; RV64-NEXT: srai s1, a0, 31 +; RV64-NEXT: slli a0, a1, 31 +; RV64-NEXT: srli a1, a2, 33 +; RV64-NEXT: or a0, a1, a0 ; RV64-NEXT: slli a0, a0, 31 ; RV64-NEXT: srai a0, a0, 31 -; RV64-NEXT: slli a1, a1, 31 +; RV64-NEXT: slli a1, a2, 31 ; RV64-NEXT: srai s2, a1, 31 ; RV64-NEXT: addi a1, zero, 7 -; RV64-NEXT: addi s5, zero, 7 +; RV64-NEXT: addi s4, zero, 7 ; RV64-NEXT: call __moddi3@plt ; RV64-NEXT: mv s3, a0 ; RV64-NEXT: addi a1, zero, -5 @@ -456,30 +452,32 @@ ; RV64-NEXT: addi a2, s3, -1 ; RV64-NEXT: snez a2, a2 ; RV64-NEXT: neg a0, a0 -; RV64-NEXT: neg a2, a2 -; RV64-NEXT: neg a3, a1 -; RV64-NEXT: slli a4, s5, 32 -; RV64-NEXT: and a3, a3, a4 -; RV64-NEXT: srli a3, a3, 32 -; RV64-NEXT: sb a3, 12(s0) +; RV64-NEXT: neg a3, a2 +; RV64-NEXT: neg a4, a1 +; RV64-NEXT: slli a5, s4, 32 +; RV64-NEXT: and a4, a4, a5 +; RV64-NEXT: srli a4, a4, 32 +; RV64-NEXT: sb a4, 12(s0) ; RV64-NEXT: slli a1, a1, 2 -; RV64-NEXT: srli a3, s4, 31 -; RV64-NEXT: and a2, a2, a3 -; RV64-NEXT: srli a4, a2, 31 -; RV64-NEXT: sub a1, a4, a1 +; RV64-NEXT: addi a4, zero, 3 +; RV64-NEXT: slli a4, a4, 31 +; RV64-NEXT: and a3, a3, a4 +; RV64-NEXT: srli a3, a3, 31 +; RV64-NEXT: sub a1, a3, a1 ; RV64-NEXT: sw a1, 8(s0) -; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: addi a1, zero, -1 +; RV64-NEXT: srli a1, a1, 31 +; RV64-NEXT: and a0, a0, a1 ; RV64-NEXT: slli a1, a2, 33 -; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: sd a0, 0(s0) -; RV64-NEXT: ld s5, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s4, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s3, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s2, 32(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: ld s4, 0(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 48 ; RV64-NEXT: ret ; ; RV32M-LABEL: test_srem_vec: @@ -495,7 +493,7 @@ ; RV32M-NEXT: sw s6, 0(sp) # 4-byte Folded Spill ; RV32M-NEXT: mv s0, a0 ; RV32M-NEXT: lw a0, 4(a0) -; RV32M-NEXT: lbu a1, 12(s0) +; RV32M-NEXT: lb a1, 12(s0) ; RV32M-NEXT: lw a2, 8(s0) ; RV32M-NEXT: andi a3, a0, 1 ; RV32M-NEXT: neg s2, a3 @@ -571,54 +569,51 @@ ; RV64M-NEXT: lb a1, 12(a0) ; RV64M-NEXT: lwu a2, 8(a0) ; RV64M-NEXT: slli a1, a1, 32 -; RV64M-NEXT: or a2, a2, a1 -; RV64M-NEXT: addi a6, zero, -1 -; RV64M-NEXT: srli a3, a6, 24 -; RV64M-NEXT: and a2, a2, a3 +; RV64M-NEXT: or a1, a2, a1 ; RV64M-NEXT: ld a3, 0(a0) -; RV64M-NEXT: slli a4, a2, 29 -; RV64M-NEXT: srai a4, a4, 31 +; RV64M-NEXT: slli a1, a1, 29 +; RV64M-NEXT: srai a1, a1, 31 ; RV64M-NEXT: slli a2, a2, 31 -; RV64M-NEXT: srli a5, a3, 33 -; RV64M-NEXT: or a2, a5, a2 +; RV64M-NEXT: srli a4, a3, 33 +; RV64M-NEXT: or a2, a4, a2 ; RV64M-NEXT: slli a2, a2, 31 ; RV64M-NEXT: srai a2, a2, 31 ; RV64M-NEXT: slli a3, a3, 31 ; RV64M-NEXT: srai a3, a3, 31 -; RV64M-NEXT: lui a5, 18725 -; RV64M-NEXT: addiw a5, a5, -1755 -; RV64M-NEXT: slli a5, a5, 12 -; RV64M-NEXT: addi a5, a5, -1755 -; RV64M-NEXT: slli a5, a5, 12 -; RV64M-NEXT: addi a5, a5, -1755 -; RV64M-NEXT: slli a5, a5, 12 -; RV64M-NEXT: addi a5, a5, -1755 -; RV64M-NEXT: mulh a5, a2, a5 -; RV64M-NEXT: srli a1, a5, 63 -; RV64M-NEXT: srai a5, a5, 1 -; RV64M-NEXT: add a1, a5, a1 -; RV64M-NEXT: slli a5, a1, 3 -; RV64M-NEXT: sub a1, a1, a5 -; RV64M-NEXT: add a1, a2, a1 -; RV64M-NEXT: lui a2, 1035469 -; RV64M-NEXT: addiw a2, a2, -819 -; RV64M-NEXT: slli a2, a2, 12 -; RV64M-NEXT: addi a2, a2, -819 -; RV64M-NEXT: slli a2, a2, 12 -; RV64M-NEXT: addi a2, a2, -819 -; RV64M-NEXT: slli a2, a2, 13 -; RV64M-NEXT: addi a2, a2, -1639 -; RV64M-NEXT: mulh a2, a4, a2 -; RV64M-NEXT: srli a5, a2, 63 -; RV64M-NEXT: srai a2, a2, 1 -; RV64M-NEXT: add a2, a2, a5 -; RV64M-NEXT: slli a5, a2, 2 -; RV64M-NEXT: add a2, a5, a2 -; RV64M-NEXT: add a2, a4, a2 -; RV64M-NEXT: addi a2, a2, -2 +; RV64M-NEXT: lui a4, 18725 +; RV64M-NEXT: addiw a4, a4, -1755 +; RV64M-NEXT: slli a4, a4, 12 +; RV64M-NEXT: addi a4, a4, -1755 +; RV64M-NEXT: slli a4, a4, 12 +; RV64M-NEXT: addi a4, a4, -1755 +; RV64M-NEXT: slli a4, a4, 12 +; RV64M-NEXT: addi a4, a4, -1755 +; RV64M-NEXT: mulh a4, a2, a4 +; RV64M-NEXT: srli a5, a4, 63 +; RV64M-NEXT: srai a4, a4, 1 +; RV64M-NEXT: add a4, a4, a5 +; RV64M-NEXT: slli a5, a4, 3 +; RV64M-NEXT: sub a4, a4, a5 +; RV64M-NEXT: add a2, a2, a4 +; RV64M-NEXT: lui a4, 1035469 +; RV64M-NEXT: addiw a4, a4, -819 +; RV64M-NEXT: slli a4, a4, 12 +; RV64M-NEXT: addi a4, a4, -819 +; RV64M-NEXT: slli a4, a4, 12 +; RV64M-NEXT: addi a4, a4, -819 +; RV64M-NEXT: slli a4, a4, 13 +; RV64M-NEXT: addi a4, a4, -1639 +; RV64M-NEXT: mulh a4, a1, a4 +; RV64M-NEXT: srli a5, a4, 63 +; RV64M-NEXT: srai a4, a4, 1 +; RV64M-NEXT: add a4, a4, a5 +; RV64M-NEXT: slli a5, a4, 2 +; RV64M-NEXT: add a4, a5, a4 +; RV64M-NEXT: add a1, a1, a4 +; RV64M-NEXT: addi a1, a1, -2 +; RV64M-NEXT: snez a6, a1 +; RV64M-NEXT: addi a2, a2, -1 ; RV64M-NEXT: snez a2, a2 -; RV64M-NEXT: addi a1, a1, -1 -; RV64M-NEXT: snez a1, a1 ; RV64M-NEXT: lui a4, 1026731 ; RV64M-NEXT: addiw a4, a4, -1365 ; RV64M-NEXT: slli a4, a4, 12 @@ -641,24 +636,27 @@ ; RV64M-NEXT: srli a3, a3, 1 ; RV64M-NEXT: or a3, a3, a5 ; RV64M-NEXT: sltu a3, a4, a3 -; RV64M-NEXT: neg a1, a1 ; RV64M-NEXT: neg a4, a2 +; RV64M-NEXT: neg a5, a6 ; RV64M-NEXT: neg a3, a3 -; RV64M-NEXT: addi a5, zero, 7 -; RV64M-NEXT: slli a5, a5, 32 -; RV64M-NEXT: and a4, a4, a5 -; RV64M-NEXT: srli a4, a4, 32 -; RV64M-NEXT: sb a4, 12(a0) -; RV64M-NEXT: slli a2, a2, 2 -; RV64M-NEXT: srli a4, a6, 31 -; RV64M-NEXT: and a1, a1, a4 -; RV64M-NEXT: srli a5, a1, 31 -; RV64M-NEXT: sub a2, a5, a2 -; RV64M-NEXT: sw a2, 8(a0) -; RV64M-NEXT: slli a1, a1, 33 -; RV64M-NEXT: and a2, a3, a4 -; RV64M-NEXT: or a1, a2, a1 +; RV64M-NEXT: slli a2, a2, 33 +; RV64M-NEXT: addi a1, zero, -1 +; RV64M-NEXT: srli a1, a1, 31 +; RV64M-NEXT: and a1, a3, a1 +; RV64M-NEXT: sub a1, a1, a2 ; RV64M-NEXT: sd a1, 0(a0) +; RV64M-NEXT: addi a1, zero, 7 +; RV64M-NEXT: slli a1, a1, 32 +; RV64M-NEXT: and a1, a5, a1 +; RV64M-NEXT: srli a1, a1, 32 +; RV64M-NEXT: sb a1, 12(a0) +; RV64M-NEXT: slli a1, a6, 2 +; RV64M-NEXT: addi a2, zero, 3 +; RV64M-NEXT: slli a2, a2, 31 +; RV64M-NEXT: and a2, a4, a2 +; RV64M-NEXT: srli a2, a2, 31 +; RV64M-NEXT: sub a1, a2, a1 +; RV64M-NEXT: sw a1, 8(a0) ; RV64M-NEXT: ret ; ; RV32MV-LABEL: test_srem_vec: @@ -679,7 +677,7 @@ ; RV32MV-NEXT: slli a2, a0, 31 ; RV32MV-NEXT: srli a3, a1, 1 ; RV32MV-NEXT: or s2, a3, a2 -; RV32MV-NEXT: lbu a2, 12(s1) +; RV32MV-NEXT: lb a2, 12(s1) ; RV32MV-NEXT: srli a3, a0, 1 ; RV32MV-NEXT: andi a3, a3, 1 ; RV32MV-NEXT: neg s3, a3 @@ -774,53 +772,50 @@ ; RV64MV-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; RV64MV-NEXT: addi s0, sp, 96 ; RV64MV-NEXT: andi sp, sp, -32 -; RV64MV-NEXT: lb a1, 12(a0) -; RV64MV-NEXT: lwu a2, 8(a0) -; RV64MV-NEXT: slli a1, a1, 32 -; RV64MV-NEXT: or a2, a2, a1 -; RV64MV-NEXT: addi a6, zero, -1 -; RV64MV-NEXT: ld a3, 0(a0) -; RV64MV-NEXT: srli a4, a6, 24 -; RV64MV-NEXT: and a2, a2, a4 -; RV64MV-NEXT: slli a4, a2, 31 -; RV64MV-NEXT: srli a5, a3, 33 -; RV64MV-NEXT: or a4, a5, a4 -; RV64MV-NEXT: slli a4, a4, 31 -; RV64MV-NEXT: srai a4, a4, 31 -; RV64MV-NEXT: slli a2, a2, 29 -; RV64MV-NEXT: srai a2, a2, 31 +; RV64MV-NEXT: lwu a1, 8(a0) +; RV64MV-NEXT: ld a2, 0(a0) +; RV64MV-NEXT: slli a3, a1, 31 +; RV64MV-NEXT: srli a4, a2, 33 +; RV64MV-NEXT: lb a5, 12(a0) +; RV64MV-NEXT: or a3, a4, a3 ; RV64MV-NEXT: slli a3, a3, 31 ; RV64MV-NEXT: srai a3, a3, 31 -; RV64MV-NEXT: lui a5, 10923 -; RV64MV-NEXT: addiw a5, a5, -1365 -; RV64MV-NEXT: slli a5, a5, 12 -; RV64MV-NEXT: addi a5, a5, -1365 -; RV64MV-NEXT: slli a5, a5, 12 -; RV64MV-NEXT: addi a5, a5, -1365 -; RV64MV-NEXT: slli a5, a5, 12 -; RV64MV-NEXT: addi a5, a5, -1365 -; RV64MV-NEXT: mulh a5, a3, a5 -; RV64MV-NEXT: srli a1, a5, 63 -; RV64MV-NEXT: add a1, a5, a1 +; RV64MV-NEXT: slli a4, a5, 32 +; RV64MV-NEXT: or a1, a1, a4 +; RV64MV-NEXT: slli a1, a1, 29 +; RV64MV-NEXT: srai a1, a1, 31 +; RV64MV-NEXT: slli a2, a2, 31 +; RV64MV-NEXT: srai a2, a2, 31 +; RV64MV-NEXT: lui a4, 10923 +; RV64MV-NEXT: addiw a4, a4, -1365 +; RV64MV-NEXT: slli a4, a4, 12 +; RV64MV-NEXT: addi a4, a4, -1365 +; RV64MV-NEXT: slli a4, a4, 12 +; RV64MV-NEXT: addi a4, a4, -1365 +; RV64MV-NEXT: slli a4, a4, 12 +; RV64MV-NEXT: addi a4, a4, -1365 +; RV64MV-NEXT: mulh a4, a2, a4 +; RV64MV-NEXT: srli a5, a4, 63 +; RV64MV-NEXT: add a4, a4, a5 ; RV64MV-NEXT: addi a5, zero, 6 -; RV64MV-NEXT: mul a1, a1, a5 -; RV64MV-NEXT: sub a1, a3, a1 -; RV64MV-NEXT: sd a1, 32(sp) -; RV64MV-NEXT: lui a1, 1035469 -; RV64MV-NEXT: addiw a1, a1, -819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -819 -; RV64MV-NEXT: slli a1, a1, 13 -; RV64MV-NEXT: addi a1, a1, -1639 -; RV64MV-NEXT: mulh a1, a2, a1 -; RV64MV-NEXT: srli a3, a1, 63 -; RV64MV-NEXT: srai a1, a1, 1 -; RV64MV-NEXT: add a1, a1, a3 -; RV64MV-NEXT: slli a3, a1, 2 -; RV64MV-NEXT: add a1, a3, a1 -; RV64MV-NEXT: add a1, a2, a1 +; RV64MV-NEXT: mul a4, a4, a5 +; RV64MV-NEXT: sub a2, a2, a4 +; RV64MV-NEXT: sd a2, 32(sp) +; RV64MV-NEXT: lui a2, 1035469 +; RV64MV-NEXT: addiw a2, a2, -819 +; RV64MV-NEXT: slli a2, a2, 12 +; RV64MV-NEXT: addi a2, a2, -819 +; RV64MV-NEXT: slli a2, a2, 12 +; RV64MV-NEXT: addi a2, a2, -819 +; RV64MV-NEXT: slli a2, a2, 13 +; RV64MV-NEXT: addi a2, a2, -1639 +; RV64MV-NEXT: mulh a2, a1, a2 +; RV64MV-NEXT: srli a4, a2, 63 +; RV64MV-NEXT: srai a2, a2, 1 +; RV64MV-NEXT: add a2, a2, a4 +; RV64MV-NEXT: slli a4, a2, 2 +; RV64MV-NEXT: add a2, a4, a2 +; RV64MV-NEXT: add a1, a1, a2 ; RV64MV-NEXT: sd a1, 48(sp) ; RV64MV-NEXT: lui a1, 18725 ; RV64MV-NEXT: addiw a1, a1, -1755 @@ -830,13 +825,13 @@ ; RV64MV-NEXT: addi a1, a1, -1755 ; RV64MV-NEXT: slli a1, a1, 12 ; RV64MV-NEXT: addi a1, a1, -1755 -; RV64MV-NEXT: mulh a1, a4, a1 +; RV64MV-NEXT: mulh a1, a3, a1 ; RV64MV-NEXT: srli a2, a1, 63 ; RV64MV-NEXT: srai a1, a1, 1 ; RV64MV-NEXT: add a1, a1, a2 ; RV64MV-NEXT: slli a2, a1, 3 ; RV64MV-NEXT: sub a1, a1, a2 -; RV64MV-NEXT: add a1, a4, a1 +; RV64MV-NEXT: add a1, a3, a1 ; RV64MV-NEXT: sd a1, 40(sp) ; RV64MV-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64MV-NEXT: addi a1, sp, 32 @@ -844,7 +839,8 @@ ; RV64MV-NEXT: lui a1, %hi(.LCPI3_0) ; RV64MV-NEXT: addi a1, a1, %lo(.LCPI3_0) ; RV64MV-NEXT: vle64.v v28, (a1) -; RV64MV-NEXT: srli a1, a6, 31 +; RV64MV-NEXT: addi a1, zero, -1 +; RV64MV-NEXT: srli a1, a1, 31 ; RV64MV-NEXT: vand.vx v26, v26, a1 ; RV64MV-NEXT: vmsne.vv v0, v26, v28 ; RV64MV-NEXT: vmv.v.i v26, 0 @@ -855,18 +851,20 @@ ; RV64MV-NEXT: srli a3, a2, 30 ; RV64MV-NEXT: andi a3, a3, 7 ; RV64MV-NEXT: sb a3, 12(a0) -; RV64MV-NEXT: slli a2, a2, 2 -; RV64MV-NEXT: vslidedown.vi v28, v26, 1 -; RV64MV-NEXT: vmv.x.s a3, v28 -; RV64MV-NEXT: and a3, a3, a1 -; RV64MV-NEXT: srli a4, a3, 31 -; RV64MV-NEXT: or a2, a4, a2 -; RV64MV-NEXT: sw a2, 8(a0) -; RV64MV-NEXT: vmv.x.s a2, v26 -; RV64MV-NEXT: and a1, a2, a1 -; RV64MV-NEXT: slli a2, a3, 33 -; RV64MV-NEXT: or a1, a1, a2 +; RV64MV-NEXT: vmv.x.s a3, v26 +; RV64MV-NEXT: and a1, a3, a1 +; RV64MV-NEXT: vslidedown.vi v26, v26, 1 +; RV64MV-NEXT: vmv.x.s a3, v26 +; RV64MV-NEXT: slli a4, a3, 33 +; RV64MV-NEXT: or a1, a1, a4 ; RV64MV-NEXT: sd a1, 0(a0) +; RV64MV-NEXT: slli a1, a2, 2 +; RV64MV-NEXT: addi a2, zero, 3 +; RV64MV-NEXT: slli a2, a2, 31 +; RV64MV-NEXT: and a2, a3, a2 +; RV64MV-NEXT: srli a2, a2, 31 +; RV64MV-NEXT: or a1, a2, a1 +; RV64MV-NEXT: sw a1, 8(a0) ; RV64MV-NEXT: addi sp, s0, -96 ; RV64MV-NEXT: ld s0, 80(sp) # 8-byte Folded Reload ; RV64MV-NEXT: ld ra, 88(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/vec3-setcc-crash.ll --- a/llvm/test/CodeGen/RISCV/vec3-setcc-crash.ll +++ b/llvm/test/CodeGen/RISCV/vec3-setcc-crash.ll @@ -12,30 +12,27 @@ ; RV32-LABEL: vec3_setcc_crash: ; RV32: # %bb.0: ; RV32-NEXT: lw a0, 0(a0) -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: and a2, a0, a2 -; RV32-NEXT: slli a3, a2, 16 -; RV32-NEXT: srai a6, a3, 24 -; RV32-NEXT: slli a4, a0, 24 -; RV32-NEXT: srai a3, a4, 24 -; RV32-NEXT: slli a4, a0, 8 -; RV32-NEXT: mv a5, a0 -; RV32-NEXT: bgtz a3, .LBB0_2 +; RV32-NEXT: slli a2, a0, 8 +; RV32-NEXT: slli a3, a0, 24 +; RV32-NEXT: slli a4, a0, 16 +; RV32-NEXT: srai a5, a4, 24 +; RV32-NEXT: srai a3, a3, 24 +; RV32-NEXT: bgtz a5, .LBB0_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a5, zero +; RV32-NEXT: j .LBB0_3 ; RV32-NEXT: .LBB0_2: -; RV32-NEXT: srai a4, a4, 24 -; RV32-NEXT: andi a5, a5, 255 -; RV32-NEXT: bgtz a6, .LBB0_4 -; RV32-NEXT: # %bb.3: -; RV32-NEXT: mv a2, zero -; RV32-NEXT: j .LBB0_5 -; RV32-NEXT: .LBB0_4: -; RV32-NEXT: srli a2, a2, 8 +; RV32-NEXT: srli a5, a4, 24 +; RV32-NEXT: .LBB0_3: +; RV32-NEXT: srai a4, a2, 24 +; RV32-NEXT: slli a2, a5, 8 +; RV32-NEXT: mv a5, a0 +; RV32-NEXT: bgtz a3, .LBB0_5 +; RV32-NEXT: # %bb.4: +; RV32-NEXT: mv a5, zero ; RV32-NEXT: .LBB0_5: -; RV32-NEXT: slli a2, a2, 8 -; RV32-NEXT: or a2, a5, a2 +; RV32-NEXT: andi a3, a5, 255 +; RV32-NEXT: or a2, a3, a2 ; RV32-NEXT: bgtz a4, .LBB0_7 ; RV32-NEXT: # %bb.6: ; RV32-NEXT: mv a0, zero @@ -50,30 +47,27 @@ ; RV64-LABEL: vec3_setcc_crash: ; RV64: # %bb.0: ; RV64-NEXT: lwu a0, 0(a0) -; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, -256 -; RV64-NEXT: and a2, a0, a2 -; RV64-NEXT: slli a3, a2, 48 -; RV64-NEXT: srai a6, a3, 56 -; RV64-NEXT: slli a4, a0, 56 -; RV64-NEXT: srai a3, a4, 56 -; RV64-NEXT: slli a4, a0, 40 -; RV64-NEXT: mv a5, a0 -; RV64-NEXT: bgtz a3, .LBB0_2 +; RV64-NEXT: slli a2, a0, 40 +; RV64-NEXT: slli a3, a0, 56 +; RV64-NEXT: slli a4, a0, 48 +; RV64-NEXT: srai a5, a4, 56 +; RV64-NEXT: srai a3, a3, 56 +; RV64-NEXT: bgtz a5, .LBB0_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a5, zero +; RV64-NEXT: j .LBB0_3 ; RV64-NEXT: .LBB0_2: -; RV64-NEXT: srai a4, a4, 56 -; RV64-NEXT: andi a5, a5, 255 -; RV64-NEXT: bgtz a6, .LBB0_4 -; RV64-NEXT: # %bb.3: -; RV64-NEXT: mv a2, zero -; RV64-NEXT: j .LBB0_5 -; RV64-NEXT: .LBB0_4: -; RV64-NEXT: srli a2, a2, 8 +; RV64-NEXT: srli a5, a4, 56 +; RV64-NEXT: .LBB0_3: +; RV64-NEXT: srai a4, a2, 56 +; RV64-NEXT: slli a2, a5, 8 +; RV64-NEXT: mv a5, a0 +; RV64-NEXT: bgtz a3, .LBB0_5 +; RV64-NEXT: # %bb.4: +; RV64-NEXT: mv a5, zero ; RV64-NEXT: .LBB0_5: -; RV64-NEXT: slli a2, a2, 8 -; RV64-NEXT: or a2, a5, a2 +; RV64-NEXT: andi a3, a5, 255 +; RV64-NEXT: or a2, a3, a2 ; RV64-NEXT: bgtz a4, .LBB0_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: mv a0, zero diff --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll --- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll +++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: nihh %r1, 4095 ; CHECK-NEXT: stg %r1, 0(%r2) ; CHECK-NEXT: vlgvf %r1, %v24, 2 -; CHECK-NEXT: risbgn %r0, %r0, 0, 129, 62 +; CHECK-NEXT: sllg %r0, %r0, 62 ; CHECK-NEXT: rosbg %r0, %r1, 2, 32, 31 ; CHECK-NEXT: vlgvf %r1, %v24, 3 ; CHECK-NEXT: rosbg %r0, %r1, 33, 63, 0 @@ -76,38 +76,41 @@ ; CHECK-NEXT: stmg %r14, %r15, 112(%r15) ; CHECK-NEXT: .cfi_offset %r14, -48 ; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: vlgvf %r0, %v26, 3 -; CHECK-NEXT: vlgvf %r4, %v24, 1 -; CHECK-NEXT: vlgvf %r3, %v24, 2 -; CHECK-NEXT: srlk %r1, %r0, 8 +; CHECK-NEXT: vlgvf %r1, %v26, 3 +; CHECK-NEXT: vlgvf %r0, %v26, 2 +; CHECK-NEXT: stc %r1, 30(%r2) +; CHECK-NEXT: srlk %r3, %r1, 8 +; CHECK-NEXT: risbgn %r1, %r1, 33, 167, 0 +; CHECK-NEXT: vlgvf %r5, %v24, 2 +; CHECK-NEXT: rosbg %r1, %r0, 2, 32, 31 +; CHECK-NEXT: sth %r3, 28(%r2) +; CHECK-NEXT: srlg %r1, %r1, 24 +; CHECK-NEXT: vlgvf %r3, %v24, 3 +; CHECK-NEXT: st %r1, 24(%r2) +; CHECK-NEXT: vlgvf %r1, %v26, 0 +; CHECK-NEXT: risbgn %r14, %r5, 6, 164, 27 +; CHECK-NEXT: sllg %r4, %r3, 60 +; CHECK-NEXT: rosbg %r14, %r3, 37, 63, 60 +; CHECK-NEXT: sllg %r3, %r14, 8 +; CHECK-NEXT: rosbg %r4, %r1, 4, 34, 29 +; CHECK-NEXT: rosbg %r3, %r4, 56, 63, 8 +; CHECK-NEXT: stg %r3, 8(%r2) +; CHECK-NEXT: vlgvf %r3, %v24, 1 +; CHECK-NEXT: sllg %r4, %r3, 58 +; CHECK-NEXT: rosbg %r4, %r5, 6, 36, 27 ; CHECK-NEXT: vlgvf %r5, %v24, 0 -; CHECK-NEXT: sth %r1, 28(%r2) -; CHECK-NEXT: risbgn %r1, %r4, 0, 133, 58 ; CHECK-NEXT: sllg %r5, %r5, 25 -; CHECK-NEXT: stc %r0, 30(%r2) -; CHECK-NEXT: rosbg %r1, %r3, 6, 36, 27 -; CHECK-NEXT: vlgvf %r3, %v24, 3 -; CHECK-NEXT: rosbg %r5, %r4, 39, 63, 58 -; CHECK-NEXT: sllg %r4, %r5, 8 -; CHECK-NEXT: rosbg %r1, %r3, 37, 63, 60 -; CHECK-NEXT: vlgvf %r5, %v26, 1 -; CHECK-NEXT: rosbg %r4, %r1, 56, 63, 8 -; CHECK-NEXT: stg %r4, 0(%r2) -; CHECK-NEXT: vlgvf %r4, %v26, 2 -; CHECK-NEXT: risbgn %r14, %r5, 0, 129, 62 -; CHECK-NEXT: risbgn %r3, %r3, 0, 131, 60 -; CHECK-NEXT: rosbg %r14, %r4, 2, 32, 31 -; CHECK-NEXT: rosbg %r14, %r0, 33, 63, 0 -; CHECK-NEXT: srlg %r0, %r14, 24 -; CHECK-NEXT: st %r0, 24(%r2) -; CHECK-NEXT: vlgvf %r0, %v26, 0 -; CHECK-NEXT: rosbg %r3, %r0, 4, 34, 29 -; CHECK-NEXT: sllg %r0, %r1, 8 -; CHECK-NEXT: rosbg %r3, %r5, 35, 63, 62 -; CHECK-NEXT: rosbg %r0, %r3, 56, 63, 8 -; CHECK-NEXT: stg %r0, 8(%r2) -; CHECK-NEXT: sllg %r0, %r3, 8 -; CHECK-NEXT: rosbg %r0, %r14, 56, 63, 8 +; CHECK-NEXT: rosbg %r5, %r3, 39, 63, 58 +; CHECK-NEXT: sllg %r3, %r5, 8 +; CHECK-NEXT: rosbg %r3, %r4, 56, 63, 8 +; CHECK-NEXT: stg %r3, 0(%r2) +; CHECK-NEXT: vlgvf %r3, %v26, 1 +; CHECK-NEXT: sllg %r4, %r3, 62 +; CHECK-NEXT: rosbg %r4, %r0, 2, 32, 31 +; CHECK-NEXT: risbgn %r0, %r1, 4, 162, 29 +; CHECK-NEXT: rosbg %r0, %r3, 35, 63, 62 +; CHECK-NEXT: sllg %r0, %r0, 8 +; CHECK-NEXT: rosbg %r0, %r4, 56, 63, 8 ; CHECK-NEXT: stg %r0, 16(%r2) ; CHECK-NEXT: lmg %r14, %r15, 112(%r15) ; CHECK-NEXT: br %r14 @@ -121,20 +124,19 @@ define void @fun3(<3 x i31>* %src, <3 x i31>* %p) ; CHECK-LABEL: fun3: ; CHECK: # %bb.0: -; CHECK-NEXT: l %r0, 8(%r2) +; CHECK-NEXT: llgf %r0, 8(%r2) ; CHECK-NEXT: lg %r1, 0(%r2) ; CHECK-NEXT: sllg %r2, %r1, 32 ; CHECK-NEXT: lr %r2, %r0 -; CHECK-NEXT: srlg %r0, %r2, 62 -; CHECK-NEXT: st %r2, 8(%r3) -; CHECK-NEXT: rosbg %r0, %r1, 33, 61, 34 -; CHECK-NEXT: sllg %r1, %r0, 62 -; CHECK-NEXT: rosbg %r1, %r2, 2, 32, 0 -; CHECK-NEXT: srlg %r1, %r1, 32 -; CHECK-NEXT: sllg %r0, %r0, 30 -; CHECK-NEXT: lr %r0, %r1 -; CHECK-NEXT: nihh %r0, 8191 -; CHECK-NEXT: stg %r0, 0(%r3) +; CHECK-NEXT: risbgn %r2, %r2, 2, 160, 0 +; CHECK-NEXT: lgr %r4, %r2 +; CHECK-NEXT: rosbg %r2, %r1, 0, 1, 32 +; CHECK-NEXT: rosbg %r4, %r0, 33, 63, 0 +; CHECK-NEXT: srlg %r0, %r2, 32 +; CHECK-NEXT: lr %r1, %r0 +; CHECK-NEXT: nihh %r1, 8191 +; CHECK-NEXT: st %r4, 8(%r3) +; CHECK-NEXT: stg %r1, 0(%r3) ; CHECK-NEXT: br %r14 { %tmp = load <3 x i31>, <3 x i31>* %src diff --git a/llvm/test/CodeGen/Thumb2/thumb2-uxtb.ll b/llvm/test/CodeGen/Thumb2/thumb2-uxtb.ll --- a/llvm/test/CodeGen/Thumb2/thumb2-uxtb.ll +++ b/llvm/test/CodeGen/Thumb2/thumb2-uxtb.ll @@ -159,24 +159,14 @@ } define i32 @test10(i32 %p0) { -; CHECK-DSP-LABEL: test10: -; CHECK-DSP: @ %bb.0: -; CHECK-DSP-NEXT: mov.w r1, #16253176 -; CHECK-DSP-NEXT: and.w r0, r1, r0, lsr #7 -; CHECK-DSP-NEXT: lsrs r1, r0, #5 -; CHECK-DSP-NEXT: uxtb16 r1, r1 -; CHECK-DSP-NEXT: add r0, r1 -; CHECK-DSP-NEXT: bx lr -; -; CHECK-NO-DSP-LABEL: test10: -; CHECK-NO-DSP: @ %bb.0: -; CHECK-NO-DSP-NEXT: mov.w r1, #16253176 -; CHECK-NO-DSP-NEXT: and.w r0, r1, r0, lsr #7 -; CHECK-NO-DSP-NEXT: mov.w r1, #458759 -; CHECK-NO-DSP-NEXT: and.w r1, r1, r0, lsr #5 -; CHECK-NO-DSP-NEXT: add r0, r1 -; CHECK-NO-DSP-NEXT: bx lr - +; CHECK-LABEL: test10: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov.w r1, #16253176 +; CHECK-NEXT: mov.w r2, #458759 +; CHECK-NEXT: and.w r1, r1, r0, lsr #7 +; CHECK-NEXT: and.w r0, r2, r0, lsr #12 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: bx lr %tmp1 = lshr i32 %p0, 7 ; [#uses=1] %tmp2 = and i32 %tmp1, 16253176 ; [#uses=2] %tmp4 = lshr i32 %tmp2, 5 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll --- a/llvm/test/CodeGen/X86/ctpop-combine.ll +++ b/llvm/test/CodeGen/X86/ctpop-combine.ll @@ -88,20 +88,19 @@ ; ; NO-POPCOUNT-LABEL: test4: ; NO-POPCOUNT: # %bb.0: -; NO-POPCOUNT-NEXT: # kill: def $edi killed $edi def $rdi -; NO-POPCOUNT-NEXT: andb $127, %dil -; NO-POPCOUNT-NEXT: movl %edi, %eax -; NO-POPCOUNT-NEXT: shrb %al -; NO-POPCOUNT-NEXT: andb $21, %al -; NO-POPCOUNT-NEXT: subb %al, %dil ; NO-POPCOUNT-NEXT: movl %edi, %eax +; NO-POPCOUNT-NEXT: andb $127, %al +; NO-POPCOUNT-NEXT: shrb %dil +; NO-POPCOUNT-NEXT: andb $21, %dil +; NO-POPCOUNT-NEXT: subb %dil, %al +; NO-POPCOUNT-NEXT: movl %eax, %ecx +; NO-POPCOUNT-NEXT: andb $51, %cl +; NO-POPCOUNT-NEXT: shrb $2, %al ; NO-POPCOUNT-NEXT: andb $51, %al -; NO-POPCOUNT-NEXT: shrb $2, %dil -; NO-POPCOUNT-NEXT: andb $51, %dil -; NO-POPCOUNT-NEXT: addb %al, %dil -; NO-POPCOUNT-NEXT: movl %edi, %eax -; NO-POPCOUNT-NEXT: shrb $4, %al -; NO-POPCOUNT-NEXT: addl %edi, %eax +; NO-POPCOUNT-NEXT: addb %cl, %al +; NO-POPCOUNT-NEXT: movl %eax, %ecx +; NO-POPCOUNT-NEXT: shrb $4, %cl +; NO-POPCOUNT-NEXT: addl %ecx, %eax ; NO-POPCOUNT-NEXT: andb $15, %al ; NO-POPCOUNT-NEXT: # kill: def $al killed $al killed $eax ; NO-POPCOUNT-NEXT: retq diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll --- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -41,7 +41,7 @@ ; X86-NEXT: shll $16, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: orl $384, %eax # imm = 0x180 -; X86-NEXT: andl $16777088, %eax # imm = 0xFFFF80 +; X86-NEXT: andl $-128, %eax ; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: retl ; @@ -53,7 +53,7 @@ ; X64-NEXT: shll $16, %ecx ; X64-NEXT: orl %eax, %ecx ; X64-NEXT: orl $384, %ecx # imm = 0x180 -; X64-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 +; X64-NEXT: andl $-128, %ecx ; X64-NEXT: movw %cx, (%rdi) ; X64-NEXT: retq %b = load i24, i24* %a, align 1 @@ -121,12 +121,11 @@ ; X64-NEXT: shll $16, %ecx ; X64-NEXT: orl %eax, %ecx ; X64-NEXT: shlq $32, %rcx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: orq %rcx, %rax -; X64-NEXT: orq $384, %rax # imm = 0x180 -; X64-NEXT: movl %eax, (%rdi) -; X64-NEXT: shrq $32, %rax +; X64-NEXT: movl (%rdi), %edx +; X64-NEXT: orq %rcx, %rdx +; X64-NEXT: orq $384, %rdx # imm = 0x180 ; X64-NEXT: movw %ax, 4(%rdi) +; X64-NEXT: movl %edx, (%rdi) ; X64-NEXT: retq %aa = load i56, i56* %a, align 1 %b = or i56 %aa, 384 @@ -191,15 +190,14 @@ ; X64-NEXT: shll $16, %edx ; X64-NEXT: orl %ecx, %edx ; X64-NEXT: shlq $32, %rdx -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: movl (%rdi), %esi +; X64-NEXT: orq %rdx, %rsi ; X64-NEXT: shlq $13, %rax ; X64-NEXT: movabsq $72057594037919743, %rdx # imm = 0xFFFFFFFFFFDFFF -; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: andq %rsi, %rdx ; X64-NEXT: orq %rax, %rdx +; X64-NEXT: movw %cx, 4(%rdi) ; X64-NEXT: movl %edx, (%rdi) -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: movw %dx, 4(%rdi) ; X64-NEXT: retq %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 diff --git a/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll b/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll --- a/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll +++ b/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll @@ -5,8 +5,9 @@ ; CHECK-LABEL: t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movzwl 0, %eax -; CHECK-NEXT: orl $2, %eax -; CHECK-NEXT: movw %ax, 0 +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: orl $2, %ecx +; CHECK-NEXT: movw %cx, 0 ; CHECK-NEXT: shrl $3, %eax ; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll --- a/llvm/test/CodeGen/X86/load-local-v3i129.ll +++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll @@ -8,12 +8,14 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: shrdq $2, %rcx, %rax +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: shlq $62, %rdx ; CHECK-NEXT: shrq $2, %rcx -; CHECK-NEXT: leaq 1(,%rax,4), %rdx -; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: shrdq $62, %rcx, %rax +; CHECK-NEXT: shldq $2, %rdx, %rcx +; CHECK-NEXT: andq $-4, %rax +; CHECK-NEXT: orq $1, %rax ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: orq $-2, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $-1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: popq %rax diff --git a/llvm/test/CodeGen/X86/load-local-v4i5.ll b/llvm/test/CodeGen/X86/load-local-v4i5.ll --- a/llvm/test/CodeGen/X86/load-local-v4i5.ll +++ b/llvm/test/CodeGen/X86/load-local-v4i5.ll @@ -11,6 +11,9 @@ ; CHECK-NEXT: movb -{{[0-9]+}}(%rsp), %cl ; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movzbl %cl, %edi +; CHECK-NEXT: shrb %cl +; CHECK-NEXT: movb %cl, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $31, %eax ; CHECK-NEXT: andl $31, %esi ; CHECK-NEXT: shll $5, %esi @@ -18,16 +21,12 @@ ; CHECK-NEXT: andl $31, %edx ; CHECK-NEXT: shll $10, %edx ; CHECK-NEXT: orl %esi, %edx -; CHECK-NEXT: movzbl %cl, %eax -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shll $15, %ecx -; CHECK-NEXT: orl %edx, %ecx -; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: shrl $16, %ecx -; CHECK-NEXT: andl $15, %ecx -; CHECK-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: cmpb $31, %al +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shll $15, %eax +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: cmpb $31, %dil ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %Then ; CHECK-NEXT: int3 diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -519,21 +519,20 @@ ; AVX512F-NEXT: kshiftrw $8, %k1, %k2 ; AVX512F-NEXT: vcompresspd %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: movzbl %al, %eax -; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl %ecx -; AVX512F-NEXT: andl $-43, %ecx -; AVX512F-NEXT: subl %ecx, %eax -; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333 -; AVX512F-NEXT: shrl $2, %eax +; AVX512F-NEXT: movzbl %al, %ecx +; AVX512F-NEXT: shrl %eax +; AVX512F-NEXT: andl $85, %eax +; AVX512F-NEXT: subl %eax, %ecx +; AVX512F-NEXT: movl %ecx, %eax ; AVX512F-NEXT: andl $858993459, %eax ## imm = 0x33333333 -; AVX512F-NEXT: addl %ecx, %eax -; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $4, %ecx +; AVX512F-NEXT: shrl $2, %ecx +; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333 ; AVX512F-NEXT: addl %eax, %ecx -; AVX512F-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F -; AVX512F-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $4, %eax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F +; AVX512F-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101 ; AVX512F-NEXT: shrl $24, %eax ; AVX512F-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} ; AVX512F-NEXT: vzeroupper @@ -573,21 +572,20 @@ ; AVX512VLBW-NEXT: kshiftrw $8, %k1, %k2 ; AVX512VLBW-NEXT: vcompresspd %zmm0, (%rdi) {%k1} ; AVX512VLBW-NEXT: kmovd %k1, %eax -; AVX512VLBW-NEXT: movzbl %al, %eax -; AVX512VLBW-NEXT: movl %eax, %ecx -; AVX512VLBW-NEXT: shrl %ecx -; AVX512VLBW-NEXT: andl $-43, %ecx -; AVX512VLBW-NEXT: subl %ecx, %eax -; AVX512VLBW-NEXT: movl %eax, %ecx -; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333 -; AVX512VLBW-NEXT: shrl $2, %eax +; AVX512VLBW-NEXT: movzbl %al, %ecx +; AVX512VLBW-NEXT: shrl %eax +; AVX512VLBW-NEXT: andl $85, %eax +; AVX512VLBW-NEXT: subl %eax, %ecx +; AVX512VLBW-NEXT: movl %ecx, %eax ; AVX512VLBW-NEXT: andl $858993459, %eax ## imm = 0x33333333 -; AVX512VLBW-NEXT: addl %ecx, %eax -; AVX512VLBW-NEXT: movl %eax, %ecx -; AVX512VLBW-NEXT: shrl $4, %ecx +; AVX512VLBW-NEXT: shrl $2, %ecx +; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333 ; AVX512VLBW-NEXT: addl %eax, %ecx -; AVX512VLBW-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F -; AVX512VLBW-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 +; AVX512VLBW-NEXT: movl %ecx, %eax +; AVX512VLBW-NEXT: shrl $4, %eax +; AVX512VLBW-NEXT: addl %ecx, %eax +; AVX512VLBW-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F +; AVX512VLBW-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101 ; AVX512VLBW-NEXT: shrl $24, %eax ; AVX512VLBW-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} ; AVX512VLBW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll --- a/llvm/test/CodeGen/X86/mul128.ll +++ b/llvm/test/CodeGen/X86/mul128.ll @@ -106,13 +106,14 @@ define void @PR13897() nounwind { ; X64-LABEL: PR13897: ; X64: # %bb.0: # %"0x0" -; X64-NEXT: movl bbb(%rip), %ecx +; X64-NEXT: movq bbb(%rip), %rsi +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shlq $32, %rsi ; X64-NEXT: movabsq $4294967297, %rdx # imm = 0x100000001 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rdx ; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: shlq $32, %rcx -; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: addq %rsi, %rdx ; X64-NEXT: movq %rax, aaa(%rip) ; X64-NEXT: movq %rdx, aaa+8(%rip) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shift-mask.ll b/llvm/test/CodeGen/X86/shift-mask.ll --- a/llvm/test/CodeGen/X86/shift-mask.ll +++ b/llvm/test/CodeGen/X86/shift-mask.ll @@ -555,10 +555,11 @@ ; X86-LABEL: test_i64_lshr_lshr_1: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: shldl $3, %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (,%ecx,8), %edx +; X86-NEXT: shldl $3, %eax, %ecx ; X86-NEXT: shll $3, %eax -; X86-NEXT: shrdl $5, %edx, %eax +; X86-NEXT: shrdl $5, %ecx, %eax ; X86-NEXT: shrl $5, %edx ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -285,15 +285,14 @@ ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl %cx, %ecx -; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $16, %edx -; X86-NEXT: shll $16, %ecx +; X86-NEXT: shll $17, %edx +; X86-NEXT: shrl $15, %ecx +; X86-NEXT: andl $1, %ecx ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edx ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %edx ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -57,15 +57,15 @@ ; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000 ; CHECK-NEXT: cmovll %ecx, %edx ; CHECK-NEXT: pextrw $1, %xmm0, %esi -; CHECK-NEXT: movswl %si, %edi -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shrl $16, %eax -; CHECK-NEXT: leal (%rdi,%rdi), %esi -; CHECK-NEXT: shrdw $15, %ax, %si -; CHECK-NEXT: sarl $15, %edi -; CHECK-NEXT: cmpl $16384, %edi # imm = 0x4000 +; CHECK-NEXT: leal (%rsi,%rsi), %edi +; CHECK-NEXT: movswl %si, %eax +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: shrl $16, %esi +; CHECK-NEXT: shldw $1, %di, %si +; CHECK-NEXT: sarl $15, %eax +; CHECK-NEXT: cmpl $16384, %eax # imm = 0x4000 ; CHECK-NEXT: cmovgel %r8d, %esi -; CHECK-NEXT: cmpl $-16384, %edi # imm = 0xC000 +; CHECK-NEXT: cmpl $-16384, %eax # imm = 0xC000 ; CHECK-NEXT: cmovll %ecx, %esi ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: cwtl @@ -82,11 +82,11 @@ ; CHECK-NEXT: pinsrw $1, %esi, %xmm1 ; CHECK-NEXT: pinsrw $2, %edx, %xmm1 ; CHECK-NEXT: pextrw $3, %xmm0, %eax +; CHECK-NEXT: leal (,%rax,4), %edx ; CHECK-NEXT: cwtl -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shrl $14, %edx -; CHECK-NEXT: leal (,%rax,4), %esi -; CHECK-NEXT: shrdw $15, %dx, %si +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: shrl $14, %esi +; CHECK-NEXT: shldw $1, %dx, %si ; CHECK-NEXT: sarl $14, %eax ; CHECK-NEXT: cmpl $16384, %eax # imm = 0x4000 ; CHECK-NEXT: cmovgel %r8d, %esi