diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2207,24 +2207,6 @@ return getConstant(NewVal, SDLoc(V), V.getValueType()); break; } - case ISD::SRL: - // Only look at single-use SRLs. - if (!V.getNode()->hasOneUse()) - break; - if (auto *RHSC = dyn_cast<ConstantSDNode>(V.getOperand(1))) { - // See if we can recursively simplify the LHS. - unsigned Amt = RHSC->getZExtValue(); - - // Watch out for shift count overflow though. - if (Amt >= DemandedBits.getBitWidth()) - break; - APInt SrcDemandedBits = DemandedBits << Amt; - if (SDValue SimplifyLHS = - GetDemandedBits(V.getOperand(0), SrcDemandedBits)) - return getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS, - V.getOperand(1)); - } - break; case ISD::AND: { // X & -1 -> X (ignoring bits which aren't demanded). // Also handle the case where masked out bits in X are known to be zero. diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1445,6 +1445,16 @@ // low bits known zero. Known.Zero.setLowBits(ShAmt); + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!InDemandedMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0) { + SDValue NewOp = TLO.DAG.getNode(ISD::SHL, dl, VT, DemandedOp0, Op1); + return TLO.CombineTo(Op, NewOp); + } + } + // Try shrinking the operation as long as the shift amount will still be // in range. if ((ShAmt < DemandedBits.getActiveBits()) && @@ -1506,6 +1516,16 @@ Known.One.lshrInPlace(ShAmt); // High bits known zero. 
Known.Zero.setHighBits(ShAmt); + + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!InDemandedMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0) { + SDValue NewOp = TLO.DAG.getNode(ISD::SRL, dl, VT, DemandedOp0, Op1); + return TLO.CombineTo(Op, NewOp); + } + } } break; } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -3992,77 +3992,73 @@ ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bfe_i32 s10, s2, 0xf0000 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; GCN-NEXT: s_bfe_i32 s1, s0, 0xf0000 +; GCN-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GCN-NEXT: v_cvt_f32_i32_e32 v5, s10 +; GCN-NEXT: s_xor_b32 s1, s10, s1 +; GCN-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-NEXT: s_or_b32 s1, s1, 1 +; GCN-NEXT: v_mov_b32_e32 v7, s1 +; GCN-NEXT: s_lshr_b32 s9, s0, 15 +; GCN-NEXT: v_mul_f32_e32 v6, v5, v6 +; GCN-NEXT: v_trunc_f32_e32 v6, v6 +; GCN-NEXT: v_mad_f32 v5, -v6, v4, v5 +; GCN-NEXT: v_cvt_i32_f32_e32 v6, v6 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| +; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc +; GCN-NEXT: s_bfe_i32 s1, s2, 0xf000f +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GCN-NEXT: v_mul_lo_u32 v4, v4, s0 +; GCN-NEXT: s_bfe_i32 s0, s0, 0xf000f +; GCN-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GCN-NEXT: v_cvt_f32_i32_e32 v6, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GCN-NEXT: s_movk_i32 s3, 0x7fff -; GCN-NEXT: s_and_b32 s11, s0, s3 -; GCN-NEXT: s_bfe_i32 s11, s11, 0xf0000 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s11 -; 
GCN-NEXT: s_and_b32 s9, s2, s3 -; GCN-NEXT: s_bfe_i32 s9, s9, 0xf0000 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_xor_b32 s9, s9, s11 -; GCN-NEXT: s_ashr_i32 s9, s9, 30 -; GCN-NEXT: s_or_b32 s9, s9, 1 -; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: s_bfe_u32 s12, s0, 0xf000f -; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 -; GCN-NEXT: s_lshr_b32 s1, s0, 15 -; GCN-NEXT: s_bfe_i32 s0, s12, 0xf0000 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GCN-NEXT: s_bfe_u32 s10, s2, 0xf000f -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 -; GCN-NEXT: s_lshr_b32 s8, s2, 15 -; GCN-NEXT: s_bfe_i32 s2, s10, 0xf0000 -; GCN-NEXT: v_cvt_f32_i32_e32 v4, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GCN-NEXT: s_xor_b32 s0, s2, s0 +; GCN-NEXT: s_xor_b32 s0, s1, s0 +; GCN-NEXT: v_and_b32_e32 v3, s3, v2 +; GCN-NEXT: v_mul_f32_e32 v7, v6, v7 +; GCN-NEXT: v_trunc_f32_e32 v7, v7 +; GCN-NEXT: v_mad_f32 v6, -v7, v5, v6 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s2, v4 +; GCN-NEXT: v_bfe_i32 v2, v2, 0, 15 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| +; GCN-NEXT: v_cvt_i32_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_i32_e32 v6, v2 ; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 -; GCN-NEXT: v_trunc_f32_e32 v5, v5 -; GCN-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_and_b32_e32 v1, s3, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc -; GCN-NEXT: v_bfe_i32 v4, v1, 0, 15 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_cvt_f32_i32_e32 v5, v4 -; 
GCN-NEXT: v_and_b32_e32 v0, s3, v0 -; GCN-NEXT: v_bfe_i32 v6, v0, 0, 15 -; GCN-NEXT: v_cvt_f32_i32_e32 v7, v6 -; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v5 -; GCN-NEXT: v_xor_b32_e32 v4, v6, v4 -; GCN-NEXT: v_ashrrev_i32_e32 v4, 30, v4 -; GCN-NEXT: v_or_b32_e32 v4, 1, v4 -; GCN-NEXT: v_mul_f32_e32 v6, v7, v8 -; GCN-NEXT: v_trunc_f32_e32 v6, v6 -; GCN-NEXT: v_mad_f32 v7, -v6, v5, v7 -; GCN-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v5| -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_mul_lo_u32 v1, v4, v1 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc +; GCN-NEXT: v_and_b32_e32 v1, s3, v0 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 15 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GCN-NEXT: v_cvt_f32_i32_e32 v7, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GCN-NEXT: v_or_b32_e32 v0, 1, v0 +; GCN-NEXT: v_mul_f32_e32 v2, v7, v8 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v7, -v2, v6, v7 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: v_mul_lo_u32 v5, v5, s9 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v0, v0, v3 +; GCN-NEXT: s_lshr_b32 s8, s2, 15 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s8, v5 ; GCN-NEXT: v_and_b32_e32 v2, s3, v2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 -; GCN-NEXT: v_and_b32_e32 v3, s3, v3 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 +; GCN-NEXT: v_and_b32_e32 v3, s3, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 15, v2 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; GCN-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -463,10 +463,10 @@ ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v2i16: @@ -529,14 +529,14 @@ ; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v7 -; SI-NEXT: v_and_b32_e32 v3, s5, v3 +; SI-NEXT: v_and_b32_e32 v4, s5, v3 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, s5, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -576,18 +576,16 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 ; SI-NEXT: s_mov_b32 s6, -1 ; 
SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2 +; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -789,7 +787,7 @@ ; VI-NEXT: flat_load_ubyte v9, v[12:13] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: flat_load_ubyte v1, v[2:3] -; VI-NEXT: flat_load_ubyte v2, v[4:5] +; VI-NEXT: flat_load_ubyte v3, v[4:5] ; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6 ; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) @@ -799,12 +797,12 @@ ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3 +; VI-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 +; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1019,18 +1017,16 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2 +; 
SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2 +; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1046,25 +1042,24 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] +; VI-NEXT: flat_load_ubyte v8, v[2:3] +; VI-NEXT: flat_load_ubyte v2, v[4:5] +; VI-NEXT: flat_load_ubyte v3, v[6:7] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v8 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: 
v_cvt_f32_ubyte0_e32 v3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v4 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -657,25 +657,26 @@ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: v_and_b32_e32 v5, 15, v5 -; SI-NEXT: v_and_b32_e32 v7, s4, v3 -; SI-NEXT: v_sub_i32_e32 v8, vcc, 16, v5 -; SI-NEXT: v_lshr_b32_e32 v7, v7, v5 -; SI-NEXT: v_lshl_b32_e32 v1, v1, v8 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_and_b32_e32 v8, s4, v3 +; SI-NEXT: v_sub_i32_e32 v9, vcc, 16, v5 +; SI-NEXT: v_lshr_b32_e32 v8, v8, v5 +; SI-NEXT: v_lshl_b32_e32 v1, v1, v9 +; SI-NEXT: v_and_b32_e32 v7, s4, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_and_b32_e32 v4, 15, v4 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; SI-NEXT: v_and_b32_e32 v3, 15, v4 -; SI-NEXT: v_sub_i32_e32 v5, vcc, 16, v3 -; SI-NEXT: v_and_b32_e32 v6, s4, v2 -; SI-NEXT: v_lshr_b32_e32 v4, v6, v3 -; SI-NEXT: v_lshl_b32_e32 v0, v0, v5 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshr_b32_e32 v5, v7, v4 +; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v4 +; SI-NEXT: v_lshl_b32_e32 v0, v0, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mov_b32_e32 v6, 0xffff ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, v6, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_and_b32_e32 v1, v6, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; 
VI-LABEL: v_fshr_v2i16: @@ -853,42 +854,42 @@ ; SI-NEXT: v_sub_i32_e32 v17, vcc, 16, v11 ; SI-NEXT: v_lshr_b32_e32 v16, v16, v11 ; SI-NEXT: v_lshl_b32_e32 v3, v3, v17 +; SI-NEXT: v_and_b32_e32 v15, s4, v6 ; SI-NEXT: v_or_b32_e32 v3, v3, v16 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 +; SI-NEXT: v_and_b32_e32 v10, 15, v10 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; SI-NEXT: v_and_b32_e32 v7, 15, v10 -; SI-NEXT: v_sub_i32_e32 v11, vcc, 16, v7 -; SI-NEXT: v_and_b32_e32 v15, s4, v6 -; SI-NEXT: v_lshr_b32_e32 v10, v15, v7 -; SI-NEXT: v_lshl_b32_e32 v2, v2, v11 -; SI-NEXT: v_or_b32_e32 v2, v2, v10 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; SI-NEXT: v_mov_b32_e32 v12, 0xffff +; SI-NEXT: v_lshr_b32_e32 v11, v15, v10 +; SI-NEXT: v_sub_i32_e32 v15, vcc, 16, v10 +; SI-NEXT: v_lshl_b32_e32 v2, v2, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 15, v9 +; SI-NEXT: v_mov_b32_e32 v12, 0xffff +; SI-NEXT: v_sub_i32_e32 v9, vcc, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 ; SI-NEXT: v_and_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 15, v9 -; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v3 ; SI-NEXT: v_and_b32_e32 v14, s4, v5 -; SI-NEXT: v_lshr_b32_e32 v6, v14, v3 -; SI-NEXT: v_lshl_b32_e32 v1, v1, v7 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 -; SI-NEXT: v_and_b32_e32 v3, 15, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_lshr_b32_e32 v7, v14, v6 +; SI-NEXT: v_lshl_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; SI-NEXT: v_sub_i32_e32 v6, vcc, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 15, v8 +; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v5 ; SI-NEXT: v_and_b32_e32 v13, s4, v4 -; SI-NEXT: v_lshr_b32_e32 v5, v13, v3 -; SI-NEXT: v_lshl_b32_e32 v0, v0, v6 -; SI-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_lshr_b32_e32 v6, v13, v5 +; SI-NEXT: v_lshl_b32_e32 v0, v0, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, v12, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, v12, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -2004,42 +2004,26 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s6, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008 -; GFX7-NEXT: s_lshr_b32 s11, s5, 16 -; GFX7-NEXT: s_lshr_b32 s12, s5, 24 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: s_lshr_b32 s11, s6, 8 +; GFX7-NEXT: s_lshr_b32 s10, s6, 16 +; GFX7-NEXT: s_lshr_b32 s5, s4, 24 ; GFX7-NEXT: s_lshr_b32 s7, s4, 16 -; GFX7-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-NEXT: s_lshr_b32 s9, s4, 24 -; GFX7-NEXT: v_mov_b32_e32 v1, s12 -; GFX7-NEXT: s_mul_i32 s4, s4, s5 -; GFX7-NEXT: v_mul_u32_u24_e32 v1, s9, v1 -; GFX7-NEXT: v_mul_u32_u24_e32 v2, s7, v2 -; GFX7-NEXT: v_mul_u32_u24_e32 v3, s6, v3 -; GFX7-NEXT: s_and_b32 s5, s4, s8 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v2, s5, v3 -; GFX7-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: s_lshr_b32 s8, s4, 8 +; GFX7-NEXT: s_mul_i32 s4, s4, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-NEXT: s_lshr_b32 s9, s6, 24 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2057,29 +2041,29 @@ ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_lshr_b32 s4, s1, 24 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_mul_i32 s0, s0, s1 +; GFX8-NEXT: s_lshr_b32 s4, s1, 24 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 ; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, s3, v5 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v3, v4 -; 
GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2092,30 +2076,30 @@ ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v2, s4, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NODL-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 -; 
GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 -; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_or_b32_e32 v3, v2, v0 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ubyte v5, v[0:1], off +; GFX9-NODL-NEXT: global_load_ubyte v6, v[0:1], off ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_add_u32_e32 v3, v3, v6 +; GFX9-NODL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2128,30 +2112,30 @@ ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, s4, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-DL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 -; GFX9-DL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v0 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v5, v[0:1], off +; GFX9-DL-NEXT: global_load_ubyte v6, v[0:1], off ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v6 +; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2172,24 +2156,24 @@ ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 24 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s3 -; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s1, 
s1, 16 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s0, s1 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v5 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s0 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s2, s3 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v7, v3, v2 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v7, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -1585,19 +1585,19 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX7-NEXT: s_bfe_i32 s15, s6, 0x40018 -; GFX7-NEXT: s_bfe_i32 s16, s6, 0x40014 -; GFX7-NEXT: s_bfe_i32 s17, s6, 0x40010 -; GFX7-NEXT: s_bfe_i32 s18, s6, 0x40000 -; GFX7-NEXT: s_bfe_i32 s19, s6, 0x40004 -; GFX7-NEXT: s_bfe_i32 s20, s6, 0x40008 -; GFX7-NEXT: s_ashr_i32 s14, s6, 28 -; GFX7-NEXT: s_bfe_i32 s6, s6, 0x4000c -; GFX7-NEXT: s_ashr_i32 s5, s4, 28 +; GFX7-NEXT: s_ashr_i32 s6, s4, 28 +; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40018 +; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40014 +; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40010 +; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40000 +; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40004 +; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40008 +; GFX7-NEXT: s_ashr_i32 s14, s5, 28 +; GFX7-NEXT: s_bfe_i32 s5, s5, 0x4000c ; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018 ; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40014 ; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40010 @@ -1608,32 +1608,32 @@ ; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_bfe_i32 s4, s4, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s13, v2 ; GFX7-NEXT: v_mul_i32_i24_e32 v3, s12, v3 ; GFX7-NEXT: v_mul_i32_i24_e32 v4, s11, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v4, s8, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX7-NEXT: v_alignbit_b32 v4, v2, v3, 16 +; GFX7-NEXT: v_and_b32_e32 v1, s8, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-NEXT: v_mov_b32_e32 v6, s16 ; GFX7-NEXT: v_mov_b32_e32 v7, s15 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 
v3, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v5, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, s9, v6, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, s7, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s6, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1968,52 +1968,48 @@ ; GFX7-NEXT: s_bfe_i32 s12, s4, 0x40010 ; GFX7-NEXT: v_mov_b32_e32 v4, s19 ; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40014 -; GFX7-NEXT: v_mov_b32_e32 v3, s20 ; GFX7-NEXT: s_bfe_i32 s14, s4, 0x40018 ; GFX7-NEXT: v_mov_b32_e32 v2, s21 +; GFX7-NEXT: v_mov_b32_e32 v3, s20 ; GFX7-NEXT: s_ashr_i32 s4, s4, 28 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1 -; GFX7-NEXT: v_mul_i32_i24_e32 v2, s14, v2 -; GFX7-NEXT: v_mul_i32_i24_e32 v3, s13, v3 -; GFX7-NEXT: v_mul_i32_i24_e32 v9, s12, v4 -; GFX7-NEXT: v_mul_i32_i24_e32 v5, s11, v5 +; GFX7-NEXT: v_mul_i32_i24_e32 v9, s4, v1 +; GFX7-NEXT: v_mul_i32_i24_e32 v10, s14, v2 +; GFX7-NEXT: v_mul_i32_i24_e32 v11, s13, v3 +; GFX7-NEXT: v_mul_i32_i24_e32 v12, s12, v4 +; GFX7-NEXT: v_mul_i32_i24_e32 v13, s11, v5 ; GFX7-NEXT: v_mul_i32_i24_e32 v6, s10, v6 ; GFX7-NEXT: v_mul_i32_i24_e32 v7, s7, v7 ; GFX7-NEXT: v_mul_i32_i24_e32 v8, s6, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_and_b32_e32 v9, s8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_and_b32_e32 v10, s8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_and_b32_e32 v12, s8, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX7-NEXT: v_and_b32_e32 v6, s8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 
s8, v8 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v9, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v8, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s9, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v5, s9, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v3 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 8 -; GFX7-NEXT: v_alignbit_b32 v5, v1, v2, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX7-NEXT: v_or_b32_e32 v7, v8, v7 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v9 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v9, s9, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v7, s9, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX7-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX7-NEXT: v_alignbit_b32 v7, v8, v6, 8 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v6, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s11, v5, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, s12, v4, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_mad_i32_i24 v0, s13, v3, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2023,81 +2019,81 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: 
s_mov_b32 s33, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s11, s1, 24 -; GFX8-NEXT: s_lshl_b32 s15, s1, 16 -; GFX8-NEXT: s_ashr_i64 s[20:21], s[2:3], 60 -; GFX8-NEXT: s_lshl_b32 s23, s3, 24 -; GFX8-NEXT: s_lshl_b32 s25, s3, 28 -; GFX8-NEXT: s_lshl_b32 s27, s3, 16 -; GFX8-NEXT: s_ashr_i64 s[8:9], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s13, s1, 28 -; GFX8-NEXT: s_lshl_b32 s17, s3, 8 -; GFX8-NEXT: s_lshl_b32 s19, s3, 12 -; GFX8-NEXT: s_lshl_b32 s21, s3, 4 -; GFX8-NEXT: s_lshl_b32 s3, s3, 20 -; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s19, s3, 8 +; GFX8-NEXT: s_lshl_b32 s21, s3, 12 +; GFX8-NEXT: s_lshl_b32 s23, s3, 16 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s15, s1, 24 +; GFX8-NEXT: s_lshl_b32 s25, s3, 20 +; GFX8-NEXT: s_lshl_b32 s27, s3, 24 +; GFX8-NEXT: s_lshl_b32 s17, s3, 4 +; GFX8-NEXT: s_lshl_b32 s3, s3, 28 +; GFX8-NEXT: s_lshl_b32 s7, s1, 8 +; GFX8-NEXT: s_lshl_b32 s9, s1, 12 +; GFX8-NEXT: s_lshl_b32 s11, s1, 16 +; GFX8-NEXT: s_lshl_b32 s13, s1, 20 +; GFX8-NEXT: s_lshl_b32 s5, s1, 4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 28 ; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX8-NEXT: s_ashr_i64 s[22:23], s[22:23], 60 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60 ; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60 -; GFX8-NEXT: s_lshl_b32 s5, s1, 8 -; GFX8-NEXT: s_lshl_b32 s7, s1, 12 -; GFX8-NEXT: s_lshl_b32 s9, s1, 4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 20 ; GFX8-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, 
s2 ; GFX8-NEXT: v_mov_b32_e32 v6, s26 ; GFX8-NEXT: v_mov_b32_e32 v7, s14 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[20:21], 60 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[22:23], 60 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60 +; GFX8-NEXT: v_mul_i32_i24_sdwa v3, v4, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_i32_i24_e32 v4, s0, v5 +; GFX8-NEXT: v_mul_i32_i24_sdwa v5, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX8-NEXT: v_mov_b32_e32 v8, s24 ; GFX8-NEXT: v_mov_b32_e32 v9, s22 ; GFX8-NEXT: v_mov_b32_e32 v10, s10 -; GFX8-NEXT: v_mul_i32_i24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_i32_i24_e32 v7, s12, v8 -; GFX8-NEXT: v_mul_i32_i24_sdwa v8, v10, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX8-NEXT: v_mov_b32_e32 v5, s2 -; GFX8-NEXT: v_mul_i32_i24_e32 v5, s0, v5 -; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 -; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v6, s33, v7 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s20 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[20:21], 60 -; GFX8-NEXT: v_mul_i32_i24_sdwa v3, v4, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX8-NEXT: 
s_ashr_i64 s[6:7], s[6:7], 60 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 -; GFX8-NEXT: v_mov_b32_e32 v12, s16 -; GFX8-NEXT: v_mov_b32_e32 v13, s4 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[8:9], 60 -; GFX8-NEXT: v_mov_b32_e32 v11, s30 -; GFX8-NEXT: v_mul_i32_i24_e32 v4, s6, v4 -; GFX8-NEXT: v_mul_i32_i24_sdwa v10, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v5 -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_i32_i24_e32 v9, s28, v11 -; GFX8-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v11, s20 +; GFX8-NEXT: v_mov_b32_e32 v12, s18 +; GFX8-NEXT: v_mov_b32_e32 v13, s6 +; GFX8-NEXT: v_mul_i32_i24_e32 v6, s12, v8 +; GFX8-NEXT: v_mul_i32_i24_sdwa v7, v10, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v4, s33, v4 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_mul_i32_i24_e32 v8, s8, v11 +; GFX8-NEXT: v_mul_i32_i24_sdwa v9, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX8-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: s_ashr_i64 s[30:31], s[16:17], 60 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX8-NEXT: v_mov_b32_e32 v14, s30 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v6 +; GFX8-NEXT: v_mul_i32_i24_e32 v10, s28, v14 +; GFX8-NEXT: v_and_b32_e32 v5, s33, v8 +; GFX8-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 
v7, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v10 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2113,79 +2109,79 @@ ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s7, s0, 4 -; GFX9-NEXT: s_lshr_b32 s14, s1, 4 +; GFX9-NEXT: s_lshr_b32 s9, s0, 4 +; GFX9-NEXT: s_lshr_b32 s16, s1, 4 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s14 -; GFX9-NEXT: s_lshr_b32 s8, s0, 12 -; GFX9-NEXT: s_lshr_b32 s9, s0, 8 -; GFX9-NEXT: s_lshr_b32 s15, s1, 12 -; GFX9-NEXT: s_lshr_b32 s16, s1, 8 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s9 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s8 ; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s16 -; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s15 +; GFX9-NEXT: s_lshr_b32 s5, s0, 20 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s12, s1, 20 +; GFX9-NEXT: s_lshr_b32 s13, s1, 16 +; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s6 +; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; 
GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s13 +; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s12 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX9-NEXT: s_lshr_b32 s7, s0, 12 +; GFX9-NEXT: s_lshr_b32 s8, s0, 8 +; GFX9-NEXT: s_lshr_b32 s14, s1, 12 +; GFX9-NEXT: s_lshr_b32 s15, s1, 8 +; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15 +; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 +; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s8 +; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s15 +; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s14 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX9-NEXT: v_and_b32_e32 v5, s2, v8 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s3, s0, 20 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s1, 20 -; GFX9-NEXT: 
s_lshr_b32 s11, s1, 16 -; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 +; GFX9-NEXT: v_mul_lo_u16_e32 v6, v6, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_lshr_b32 s3, s0, 28 +; GFX9-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_lshr_b32 s11, s1, 24 ; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4 ; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s3 ; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s11 ; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s10 -; GFX9-NEXT: s_lshr_b32 s5, s0, 28 -; GFX9-NEXT: s_lshr_b32 s6, s0, 24 -; GFX9-NEXT: s_lshr_b32 s12, s1, 28 -; GFX9-NEXT: s_lshr_b32 s13, s1, 24 -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s6 -; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s5 -; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s13 -; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s12 -; GFX9-NEXT: v_or_b32_e32 v5, v3, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v18 -; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 ; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-NEXT: v_or_b32_e32 v6, v4, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v8 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v7 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v9 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v4 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2201,79 +2197,79 @@ ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 4 -; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 4 +; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 4 +; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 4 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s14 -; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 12 -; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 8 -; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 12 -; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 8 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 
v6, 12, s8 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s15 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 20 +; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 20 +; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 16 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 12 +; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 8 +; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 12 +; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 8 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s15 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s14 +; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15 +; GFX9-DL-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4 +; GFX9-DL-NEXT: 
v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v8 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s3, s0, 20 -; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 20 -; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 16 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v6, v6, v13 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX9-DL-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: s_lshr_b32 s3, s0, 28 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 24 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s11 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s10 -; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 28 -; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 24 -; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 28 -; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 24 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX9-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12 -; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v5 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v18 -; GFX9-DL-NEXT: 
v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v17 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v5 ; GFX9-DL-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15 -; GFX9-DL-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-DL-NEXT: v_or_b32_e32 v6, v4, v8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v8 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v7 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v9 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v6 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2291,83 +2287,83 @@ ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; 
GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 4 -; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 4 -; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 12 +; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 4 +; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 4 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s14 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 -; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 8 -; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 +; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 16 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s15 +; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 8 +; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 12 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s16 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v12 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v14, 12, v14 -; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 20 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v8 +; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 20 +; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 20 +; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v19, v14 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7 -; 
GFX10-DL-NEXT: s_lshr_b32 s10, s1, 20 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v13 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 -; GFX10-DL-NEXT: s_lshr_b32 s11, s1, 16 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 28 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s10 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s11 -; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 24 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v8 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v5 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s14 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s0 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 16 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v7 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v9 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v14 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s12 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s13 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v10 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v6, v9 ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 
12, v11 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v13 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v16 -; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v10 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v15 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, v9, v7 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v11 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v15 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8 +; GFX10-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v19, v13 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v6 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v10 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX10-DL-NEXT: s_lshr_b32 s11, s1, 24 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s3 +; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s11 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s4 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v12 +; GFX10-DL-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v16 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v5 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v6, v12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s10 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v9 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v6, v3 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v4 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v7 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v8 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v7, v2, v4 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v7, v6 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -1997,7 +1997,7 @@ ; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40008 ; GFX7-NEXT: s_bfe_u32 s12, s4, 0x4000c ; GFX7-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2 +; GFX7-NEXT: v_mul_u32_u24_e32 v8, s12, v2 ; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4 ; GFX7-NEXT: s_lshr_b32 s6, s4, 28 ; GFX7-NEXT: s_bfe_u32 s7, 
s4, 0x40018 @@ -2008,21 +2008,20 @@ ; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mul_u32_u24_e32 v1, s4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-NEXT: v_alignbit_b32 v4, v1, v3, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, s16 ; GFX7-NEXT: v_mov_b32_e32 v6, s15 ; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 @@ -2291,61 +2290,57 @@ ; GFX7-NEXT: s_bfe_u32 s13, s5, 0x4000c ; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40004 ; GFX7-NEXT: s_lshr_b32 s17, s5, 28 -; GFX7-NEXT: v_mov_b32_e32 v8, s13 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40014 ; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40008 ; GFX7-NEXT: s_and_b32 s16, s5, 15 ; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40018 -; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v8, s13 ; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40004 ; GFX7-NEXT: v_mov_b32_e32 v6, s15 ; GFX7-NEXT: s_lshr_b32 s10, s4, 28 ; GFX7-NEXT: v_mov_b32_e32 v4, s17 -; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mul_u32_u24_e32 v10, s12, v2 +; GFX7-NEXT: v_mul_u32_u24_e32 v12, s10, v4 ; GFX7-NEXT: 
v_mul_u32_u24_e32 v6, s8, v6 -; GFX7-NEXT: v_mul_u32_u24_e32 v8, s6, v8 -; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40010 +; GFX7-NEXT: v_mul_u32_u24_e32 v13, s6, v8 ; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_and_b32 s9, s4, 15 ; GFX7-NEXT: v_mov_b32_e32 v5, s16 ; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40018 ; GFX7-NEXT: v_mov_b32_e32 v3, s18 -; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40014 -; GFX7-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2 ; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40010 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_mul_u32_u24_e32 v9, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_mul_u32_u24_e32 v11, s11, v3 ; GFX7-NEXT: v_mul_u32_u24_e32 v5, s9, v5 -; GFX7-NEXT: v_mul_u32_u24_e32 v7, s7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX7-NEXT: v_mul_u32_u24_e32 v9, s4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v9, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v5 -; GFX7-NEXT: v_alignbit_b32 v4, v2, v3, 8 -; GFX7-NEXT: v_alignbit_b32 v5, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_mul_u32_u24_e32 v7, s7, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v13 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: 
v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v7, v9, v7 +; GFX7-NEXT: v_alignbit_b32 v6, v7, v5, 8 +; GFX7-NEXT: v_alignbit_b32 v7, v7, v5, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v8, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2354,68 +2349,68 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40004 -; GFX8-NEXT: s_bfe_u32 s9, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40004 -; GFX8-NEXT: s_and_b32 s15, s2, 15 -; GFX8-NEXT: s_bfe_u32 s16, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX8-NEXT: s_lshr_b32 s5, s1, 28 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX8-NEXT: s_lshr_b32 s12, s2, 28 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s8, s1, 15 +; GFX8-NEXT: s_lshr_b32 s10, s2, 28 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40010 +; 
GFX8-NEXT: s_bfe_u32 s9, s1, 0x40004 +; GFX8-NEXT: s_bfe_u32 s14, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s2, s2, 15 +; GFX8-NEXT: s_lshr_b32 s3, s1, 28 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s8, s1, 0x40008 +; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s1, v3 +; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v6, s15 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v7, s14 ; GFX8-NEXT: v_mov_b32_e32 v8, s7 -; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v6 -; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x40018 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v10, s12 ; GFX8-NEXT: v_mov_b32_e32 v11, s5 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v6 +; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_mul_u32_u24_e32 v7, s6, v9 +; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v12, s11 ; GFX8-NEXT: v_mov_b32_e32 v13, s10 ; GFX8-NEXT: v_mov_b32_e32 v14, 
s3 -; GFX8-NEXT: v_mul_u32_u24_e32 v3, s1, v3 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_mul_u32_u24_e32 v7, s6, v9 -; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v5 ; GFX8-NEXT: v_mul_u32_u24_e32 v9, s4, v12 ; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v4, s0, v7 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v4, s0, v9 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v6, v4, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v9 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v10 +; 
GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2431,57 +2426,57 @@ ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 -; GFX9-NEXT: s_lshr_b32 s13, s1, 28 -; GFX9-NEXT: s_and_b32 s14, s1, 15 -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x4000c +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_lshr_b32 s4, s0, 28 ; GFX9-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: s_lshr_b32 s6, s0, 28 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: s_and_b32 s7, s0, 15 -; GFX9-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v8, s15 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-NEXT: s_and_b32 s9, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v10, s1 ; GFX9-NEXT: v_mul_lo_u16_e32 v3, s3, v3 ; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: 
v_mul_lo_u16_e32 v5, s5, v5 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v9, s9, v9 +; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-NEXT: v_mov_b32_e32 v8, s15 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX9-NEXT: v_and_b32_e32 v6, s2, v6 ; GFX9-NEXT: v_mul_lo_u16_e32 v7, s7, v7 ; GFX9-NEXT: v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX9-NEXT: v_mul_lo_u16_e32 v9, s9, v9 -; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX9-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v8 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v7 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: 
v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v9 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2497,57 +2492,57 @@ ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 28 -; GFX9-DL-NEXT: s_and_b32 s14, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-DL-NEXT: s_and_b32 s7, s0, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15 -; 
GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v10, s1 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s3, v3 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s5, v5 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s9, v9 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15 +; GFX9-DL-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-DL-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX9-DL-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v6, s2, v6 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, s7, v7 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s9, v9 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, 
v4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v6, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v8 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v7 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v9 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v3 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2567,51 +2562,51 @@ ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 ; GFX10-DL-NEXT: s_and_b32 s4, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s6, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s5, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s3 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s4, s6 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s5, s7 +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x40008 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: 
s_mov_b32 s4, 0xffff -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s3 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40010 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s7, s9 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 -; GFX10-DL-NEXT: s_lshr_b32 s8, s1, 28 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s3, s6 -; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s5, s6 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s8 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40010 +; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 28 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 ; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x40018 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s7 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s8 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX10-DL-NEXT: v_or_b32_e32 v4, v6, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v5 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s5, s1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v8 -; GFX10-DL-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; 
GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s3, s5 +; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 28 +; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x40018 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v7 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s3, s4 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s0, s1 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v7, v2, v3 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v7, v6 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1708,26 +1708,26 @@ ; ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; VI: ; %bb.0: +; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: 
s_mov_b32 s6, 0xffff +; VI-NEXT: s_mov_b32 s7, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_and_b32 s2, s4, s0 -; VI-NEXT: s_mov_b32 s1, 0 -; VI-NEXT: s_lshl_b32 s3, s2, 16 +; VI-NEXT: s_lshl_b32 s1, s4, 16 +; VI-NEXT: s_and_b32 s3, s4, s6 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_or_b32 s0, s3, s1 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1] -; VI-NEXT: s_or_b32 s0, s2, s3 +; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[6:7] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_bfi_b32 v1, v5, s0, v1 ; VI-NEXT: v_bfi_b32 v0, v4, s0, v0 @@ -1807,19 +1807,19 @@ ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_lshl_b32 s3, s4, 16 ; VI-NEXT: s_mov_b32 s1, 0 ; VI-NEXT: s_lshl_b32 s2, s5, 4 -; VI-NEXT: s_and_b32 s3, s4, s0 +; VI-NEXT: s_and_b32 s4, s4, s0 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; VI-NEXT: s_lshl_b32 s2, s3, 16 -; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_or_b32 s2, s4, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- 
a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -38,8 +38,9 @@ ; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -85,8 +86,9 @@ ; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -105,13 +107,10 @@ ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -122,13 +121,10 @@ ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1 -; VI-NEXT: 
v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm bb: @@ -147,13 +143,10 @@ ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -83,11 +83,11 @@ ; GCN-LABEL: v_shl_i128_vk: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_alignbit_b32 v4, v2, v1, 15 +; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 17 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 15, v1 ; GCN-NEXT: v_alignbit_b32 v1, v1, v0, 15 -; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 15 +; GCN-NEXT: v_or_b32_e32 v2, v2, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 17, v0 -; GCN-NEXT: v_mov_b32_e32 v2, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = shl i128 %lhs, 17 ret i128 %shl @@ -110,11 +110,11 @@ ; GCN-LABEL: v_ashr_i128_vk: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: v_ashr_i64 v[4:5], v[2:3], 33 -; GCN-NEXT: v_alignbit_b32 v0, v2, v1, 1 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 1 -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: v_mov_b32_e32 v3, v5 +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_lshl_b64 v[0:1], v[2:3], 31 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], 33 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = ashr i128 %lhs, 33 ret i128 %shl diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -141,10 +141,11 @@ ; SI-LABEL: trunc_v2i64_arg_to_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v0, s4, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, s4, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: trunc_v2i64_arg_to_v2i16: diff --git a/llvm/test/CodeGen/ARM/and-load-combine.ll b/llvm/test/CodeGen/ARM/and-load-combine.ll --- a/llvm/test/CodeGen/ARM/and-load-combine.ll +++ b/llvm/test/CodeGen/ARM/and-load-combine.ll @@ -1433,12 +1433,9 @@ ; ; THUMB1-LABEL: test23: ; THUMB1: @ %bb.0: -; THUMB1-NEXT: ldrb r1, [r0, #3] -; THUMB1-NEXT: ldrb r0, [r0, #4] -; THUMB1-NEXT: lsls r0, r0, #8 -; THUMB1-NEXT: adds r1, r0, r1 -; THUMB1-NEXT: lsls r0, r1, #24 -; THUMB1-NEXT: lsrs r1, r1, #8 +; THUMB1-NEXT: ldrb r1, [r0, #4] +; THUMB1-NEXT: ldrb r0, [r0, #3] +; THUMB1-NEXT: lsls r0, r0, #24 ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: test23: diff --git a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll --- a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -91,18 +91,15 @@ ; 
BE-LABEL: i56_or: ; BE: @ %bb.0: ; BE-NEXT: mov r1, r0 -; BE-NEXT: ldr r12, [r0] +; BE-NEXT: ldr r0, [r0] ; BE-NEXT: ldrh r2, [r1, #4]! ; BE-NEXT: ldrb r3, [r1, #2] ; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: strb r2, [r1, #2] -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: orr r0, r2, r0, lsl #24 +; BE-NEXT: orr r0, r0, #384 +; BE-NEXT: strb r0, [r1, #2] +; BE-NEXT: lsr r0, r0, #8 +; BE-NEXT: strh r0, [r1] ; BE-NEXT: mov pc, lr %aa = load i56, i56* %a %b = or i56 %aa, 384 @@ -121,19 +118,11 @@ ; ; BE-LABEL: i56_and_or: ; BE: @ %bb.0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: ldrh r2, [r1, #4]! -; BE-NEXT: mov r3, #128 -; BE-NEXT: strb r3, [r1, #2] -; BE-NEXT: lsl r2, r2, #8 -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldrh r1, [r0, #4]! +; BE-NEXT: mov r2, #128 +; BE-NEXT: orr r1, r1, #1 +; BE-NEXT: strb r2, [r0, #2] +; BE-NEXT: strh r1, [r0] ; BE-NEXT: mov pc, lr %b = load i56, i56* %a, align 1 @@ -154,22 +143,13 @@ ; ; BE-LABEL: i56_insert_bit: ; BE: @ %bb.0: -; BE-NEXT: .save {r11, lr} -; BE-NEXT: push {r11, lr} -; BE-NEXT: mov r2, r0 -; BE-NEXT: ldr lr, [r0] -; BE-NEXT: ldrh r12, [r2, #4]! -; BE-NEXT: ldrb r3, [r2, #2] -; BE-NEXT: orr r12, r3, r12, lsl #8 -; BE-NEXT: orr r3, r12, lr, lsl #24 -; BE-NEXT: bic r3, r3, #8192 -; BE-NEXT: orr r1, r3, r1, lsl #13 -; BE-NEXT: lsr r3, r1, #8 -; BE-NEXT: strh r3, [r2] -; BE-NEXT: bic r2, lr, #255 -; BE-NEXT: orr r1, r2, r1, lsr #24 -; BE-NEXT: str r1, [r0] -; BE-NEXT: pop {r11, lr} +; BE-NEXT: ldrh r2, [r0, #4]! 
+; BE-NEXT: mov r3, #57088 +; BE-NEXT: orr r3, r3, #16711680 +; BE-NEXT: and r2, r3, r2, lsl #8 +; BE-NEXT: orr r1, r2, r1, lsl #13 +; BE-NEXT: lsr r1, r1, #8 +; BE-NEXT: strh r1, [r0] ; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 diff --git a/llvm/test/CodeGen/ARM/ror.ll b/llvm/test/CodeGen/ARM/ror.ll --- a/llvm/test/CodeGen/ARM/ror.ll +++ b/llvm/test/CodeGen/ARM/ror.ll @@ -21,8 +21,14 @@ define <2 x i32> @test2(<2 x i32> %x) nounwind readnone { ; CHECK-LABEL: test2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ror r0, r0, #10 -; CHECK-NEXT: ror r1, r1, #10 +; CHECK-NEXT: bic r2, r0, #15 +; CHECK-NEXT: ror r0, r0, #4 +; CHECK-NEXT: lsr r0, r0, #6 +; CHECK-NEXT: orr r0, r0, r2, lsl #22 +; CHECK-NEXT: bic r2, r1, #15 +; CHECK-NEXT: ror r1, r1, #4 +; CHECK-NEXT: lsr r1, r1, #6 +; CHECK-NEXT: orr r1, r1, r2, lsl #22 ; CHECK-NEXT: bx lr entry: %high_part.i = shl <2 x i32> %x, diff --git a/llvm/test/CodeGen/ARM/uxtb.ll b/llvm/test/CodeGen/ARM/uxtb.ll --- a/llvm/test/CodeGen/ARM/uxtb.ll +++ b/llvm/test/CodeGen/ARM/uxtb.ll @@ -103,11 +103,12 @@ ; CHECK-LABEL: test10: ; CHECK: @ %bb.0: ; CHECK-NEXT: mov r1, #248 +; CHECK-NEXT: mov r2, #7 ; CHECK-NEXT: orr r1, r1, #16252928 -; CHECK-NEXT: and r0, r1, r0, lsr #7 -; CHECK-NEXT: lsr r1, r0, #5 -; CHECK-NEXT: uxtb16 r1, r1 -; CHECK-NEXT: orr r0, r1, r0 +; CHECK-NEXT: orr r2, r2, #458752 +; CHECK-NEXT: and r1, r1, r0, lsr #7 +; CHECK-NEXT: and r0, r2, r0, lsr #12 +; CHECK-NEXT: orr r0, r0, r1 ; CHECK-NEXT: bx lr %tmp1 = lshr i32 %p0, 7 %tmp2 = and i32 %tmp1, 16253176 diff --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll --- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll +++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: nihh %r1, 4095 ; CHECK-NEXT: stg %r1, 0(%r2) ; CHECK-NEXT: vlgvf %r1, %v24, 2 -; CHECK-NEXT: risbgn %r0, %r0, 0, 129, 62 +; CHECK-NEXT: sllg %r0, %r0, 62 ; 
CHECK-NEXT: rosbg %r0, %r1, 2, 32, 31 ; CHECK-NEXT: vlgvf %r1, %v24, 3 ; CHECK-NEXT: rosbg %r0, %r1, 33, 63, 0 @@ -76,38 +76,41 @@ ; CHECK-NEXT: stmg %r14, %r15, 112(%r15) ; CHECK-NEXT: .cfi_offset %r14, -48 ; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: vlgvf %r0, %v26, 3 -; CHECK-NEXT: vlgvf %r4, %v24, 1 -; CHECK-NEXT: vlgvf %r3, %v24, 2 -; CHECK-NEXT: srlk %r1, %r0, 8 +; CHECK-NEXT: vlgvf %r1, %v26, 3 +; CHECK-NEXT: vlgvf %r0, %v26, 2 +; CHECK-NEXT: stc %r1, 30(%r2) +; CHECK-NEXT: srlk %r3, %r1, 8 +; CHECK-NEXT: risbgn %r1, %r1, 33, 167, 0 +; CHECK-NEXT: vlgvf %r5, %v24, 2 +; CHECK-NEXT: rosbg %r1, %r0, 2, 32, 31 +; CHECK-NEXT: sth %r3, 28(%r2) +; CHECK-NEXT: srlg %r1, %r1, 24 +; CHECK-NEXT: vlgvf %r3, %v24, 3 +; CHECK-NEXT: st %r1, 24(%r2) +; CHECK-NEXT: vlgvf %r1, %v26, 0 +; CHECK-NEXT: risbgn %r14, %r5, 6, 164, 27 +; CHECK-NEXT: sllg %r4, %r3, 60 +; CHECK-NEXT: rosbg %r14, %r3, 37, 63, 60 +; CHECK-NEXT: sllg %r3, %r14, 8 +; CHECK-NEXT: rosbg %r4, %r1, 4, 34, 29 +; CHECK-NEXT: rosbg %r3, %r4, 56, 63, 8 +; CHECK-NEXT: stg %r3, 8(%r2) +; CHECK-NEXT: vlgvf %r3, %v24, 1 +; CHECK-NEXT: sllg %r4, %r3, 58 +; CHECK-NEXT: rosbg %r4, %r5, 6, 36, 27 ; CHECK-NEXT: vlgvf %r5, %v24, 0 -; CHECK-NEXT: sth %r1, 28(%r2) -; CHECK-NEXT: risbgn %r1, %r4, 0, 133, 58 ; CHECK-NEXT: sllg %r5, %r5, 25 -; CHECK-NEXT: stc %r0, 30(%r2) -; CHECK-NEXT: rosbg %r1, %r3, 6, 36, 27 -; CHECK-NEXT: vlgvf %r3, %v24, 3 -; CHECK-NEXT: rosbg %r5, %r4, 39, 63, 58 -; CHECK-NEXT: sllg %r4, %r5, 8 -; CHECK-NEXT: rosbg %r1, %r3, 37, 63, 60 -; CHECK-NEXT: vlgvf %r5, %v26, 1 -; CHECK-NEXT: rosbg %r4, %r1, 56, 63, 8 -; CHECK-NEXT: stg %r4, 0(%r2) -; CHECK-NEXT: vlgvf %r4, %v26, 2 -; CHECK-NEXT: risbgn %r14, %r5, 0, 129, 62 -; CHECK-NEXT: risbgn %r3, %r3, 0, 131, 60 -; CHECK-NEXT: rosbg %r14, %r4, 2, 32, 31 -; CHECK-NEXT: rosbg %r14, %r0, 33, 63, 0 -; CHECK-NEXT: srlg %r0, %r14, 24 -; CHECK-NEXT: st %r0, 24(%r2) -; CHECK-NEXT: vlgvf %r0, %v26, 0 -; CHECK-NEXT: rosbg %r3, %r0, 4, 34, 29 -; CHECK-NEXT: sllg 
%r0, %r1, 8 -; CHECK-NEXT: rosbg %r3, %r5, 35, 63, 62 -; CHECK-NEXT: rosbg %r0, %r3, 56, 63, 8 -; CHECK-NEXT: stg %r0, 8(%r2) -; CHECK-NEXT: sllg %r0, %r3, 8 -; CHECK-NEXT: rosbg %r0, %r14, 56, 63, 8 +; CHECK-NEXT: rosbg %r5, %r3, 39, 63, 58 +; CHECK-NEXT: sllg %r3, %r5, 8 +; CHECK-NEXT: rosbg %r3, %r4, 56, 63, 8 +; CHECK-NEXT: stg %r3, 0(%r2) +; CHECK-NEXT: vlgvf %r3, %v26, 1 +; CHECK-NEXT: sllg %r4, %r3, 62 +; CHECK-NEXT: rosbg %r4, %r0, 2, 32, 31 +; CHECK-NEXT: risbgn %r0, %r1, 4, 162, 29 +; CHECK-NEXT: rosbg %r0, %r3, 35, 63, 62 +; CHECK-NEXT: sllg %r0, %r0, 8 +; CHECK-NEXT: rosbg %r0, %r4, 56, 63, 8 ; CHECK-NEXT: stg %r0, 16(%r2) ; CHECK-NEXT: lmg %r14, %r15, 112(%r15) ; CHECK-NEXT: br %r14 @@ -121,20 +124,20 @@ define void @fun3(<3 x i31>* %src, <3 x i31>* %p) ; CHECK-LABEL: fun3: ; CHECK: # %bb.0: -; CHECK-NEXT: l %r0, 8(%r2) -; CHECK-NEXT: lg %r1, 0(%r2) -; CHECK-NEXT: sllg %r2, %r1, 32 -; CHECK-NEXT: lr %r2, %r0 -; CHECK-NEXT: srlg %r0, %r2, 62 -; CHECK-NEXT: st %r2, 8(%r3) -; CHECK-NEXT: rosbg %r0, %r1, 33, 61, 34 -; CHECK-NEXT: sllg %r1, %r0, 62 -; CHECK-NEXT: rosbg %r1, %r2, 2, 32, 0 -; CHECK-NEXT: srlg %r1, %r1, 32 -; CHECK-NEXT: sllg %r0, %r0, 30 -; CHECK-NEXT: lr %r0, %r1 -; CHECK-NEXT: nihh %r0, 8191 -; CHECK-NEXT: stg %r0, 0(%r3) +; CHECK-NEXT: lg %r0, 0(%r2) +; CHECK-NEXT: llgf %r2, 8(%r2) +; CHECK-NEXT: sllg %r4, %r0, 32 +; CHECK-NEXT: lr %r4, %r2 +; CHECK-NEXT: risbgn %r4, %r4, 2, 160, 0 +; CHECK-NEXT: lgr %r5, %r4 +; CHECK-NEXT: rosbg %r4, %r0, 0, 1, 32 +; CHECK-NEXT: risbgn %r1, %r0, 3, 159, 0 +; CHECK-NEXT: rosbg %r5, %r2, 33, 63, 0 +; CHECK-NEXT: srlg %r0, %r4, 32 +; CHECK-NEXT: lr %r1, %r0 +; CHECK-NEXT: nihh %r1, 8191 +; CHECK-NEXT: st %r5, 8(%r3) +; CHECK-NEXT: stg %r1, 0(%r3) ; CHECK-NEXT: br %r14 { %tmp = load <3 x i31>, <3 x i31>* %src diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll @@ -179,7 +179,6 @@ ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: and r0, r0, #3 ; CHECK-LE-NEXT: sbfx r1, r0, #0, #1 ; CHECK-LE-NEXT: sbfx r0, r0, #1, #1 ; CHECK-LE-NEXT: vmov.32 q1[0], r1 @@ -194,7 +193,6 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: and r0, r0, #3 ; CHECK-BE-NEXT: sbfx r1, r0, #0, #1 ; CHECK-BE-NEXT: sbfx r0, r0, #1, #1 ; CHECK-BE-NEXT: vmov.32 q1[0], r1 diff --git a/llvm/test/CodeGen/Thumb2/thumb2-uxtb.ll b/llvm/test/CodeGen/Thumb2/thumb2-uxtb.ll --- a/llvm/test/CodeGen/Thumb2/thumb2-uxtb.ll +++ b/llvm/test/CodeGen/Thumb2/thumb2-uxtb.ll @@ -159,24 +159,14 @@ } define i32 @test10(i32 %p0) { -; CHECK-DSP-LABEL: test10: -; CHECK-DSP: @ %bb.0: -; CHECK-DSP-NEXT: mov.w r1, #16253176 -; CHECK-DSP-NEXT: and.w r0, r1, r0, lsr #7 -; CHECK-DSP-NEXT: lsrs r1, r0, #5 -; CHECK-DSP-NEXT: uxtb16 r1, r1 -; CHECK-DSP-NEXT: add r0, r1 -; CHECK-DSP-NEXT: bx lr -; -; CHECK-NO-DSP-LABEL: test10: -; CHECK-NO-DSP: @ %bb.0: -; CHECK-NO-DSP-NEXT: mov.w r1, #16253176 -; CHECK-NO-DSP-NEXT: and.w r0, r1, r0, lsr #7 -; CHECK-NO-DSP-NEXT: mov.w r1, #458759 -; CHECK-NO-DSP-NEXT: and.w r1, r1, r0, lsr #5 -; CHECK-NO-DSP-NEXT: add r0, r1 -; CHECK-NO-DSP-NEXT: bx lr - +; CHECK-LABEL: test10: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov.w r1, #16253176 +; CHECK-NEXT: mov.w r2, #458759 +; CHECK-NEXT: and.w r1, r1, r0, lsr #7 +; CHECK-NEXT: and.w r0, r2, r0, lsr #12 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: bx lr %tmp1 = lshr i32 %p0, 7 ; [#uses=1] %tmp2 = and i32 %tmp1, 16253176 ; [#uses=2] %tmp4 = lshr i32 %tmp2, 5 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll --- a/llvm/test/CodeGen/X86/ctpop-combine.ll +++ b/llvm/test/CodeGen/X86/ctpop-combine.ll @@ -88,20 +88,19 @@ ; ; NO-POPCOUNT-LABEL: test4: ; NO-POPCOUNT: # %bb.0: -; NO-POPCOUNT-NEXT: # kill: def $edi killed $edi 
def $rdi -; NO-POPCOUNT-NEXT: andb $127, %dil -; NO-POPCOUNT-NEXT: movl %edi, %eax -; NO-POPCOUNT-NEXT: shrb %al -; NO-POPCOUNT-NEXT: andb $21, %al -; NO-POPCOUNT-NEXT: subb %al, %dil ; NO-POPCOUNT-NEXT: movl %edi, %eax +; NO-POPCOUNT-NEXT: andb $127, %al +; NO-POPCOUNT-NEXT: shrb %dil +; NO-POPCOUNT-NEXT: andb $21, %dil +; NO-POPCOUNT-NEXT: subb %dil, %al +; NO-POPCOUNT-NEXT: movl %eax, %ecx +; NO-POPCOUNT-NEXT: andb $51, %cl +; NO-POPCOUNT-NEXT: shrb $2, %al ; NO-POPCOUNT-NEXT: andb $51, %al -; NO-POPCOUNT-NEXT: shrb $2, %dil -; NO-POPCOUNT-NEXT: andb $51, %dil -; NO-POPCOUNT-NEXT: addb %al, %dil -; NO-POPCOUNT-NEXT: movl %edi, %eax -; NO-POPCOUNT-NEXT: shrb $4, %al -; NO-POPCOUNT-NEXT: addl %edi, %eax +; NO-POPCOUNT-NEXT: addb %cl, %al +; NO-POPCOUNT-NEXT: movl %eax, %ecx +; NO-POPCOUNT-NEXT: shrb $4, %cl +; NO-POPCOUNT-NEXT: addl %ecx, %eax ; NO-POPCOUNT-NEXT: andb $15, %al ; NO-POPCOUNT-NEXT: # kill: def $al killed $al killed $eax ; NO-POPCOUNT-NEXT: retq diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll --- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -41,7 +41,7 @@ ; X86-NEXT: shll $16, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: orl $384, %eax # imm = 0x180 -; X86-NEXT: andl $16777088, %eax # imm = 0xFFFF80 +; X86-NEXT: andl $-128, %eax ; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: retl ; @@ -53,7 +53,7 @@ ; X64-NEXT: shll $16, %ecx ; X64-NEXT: orl %eax, %ecx ; X64-NEXT: orl $384, %ecx # imm = 0x180 -; X64-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 +; X64-NEXT: andl $-128, %ecx ; X64-NEXT: movw %cx, (%rdi) ; X64-NEXT: retq %b = load i24, i24* %a, align 1 @@ -121,12 +121,11 @@ ; X64-NEXT: shll $16, %ecx ; X64-NEXT: orl %eax, %ecx ; X64-NEXT: shlq $32, %rcx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: orq %rcx, %rax -; X64-NEXT: orq $384, %rax # imm = 0x180 -; X64-NEXT: movl %eax, (%rdi) -; X64-NEXT: shrq $32, %rax +; X64-NEXT: 
movl (%rdi), %edx +; X64-NEXT: orq %rcx, %rdx +; X64-NEXT: orq $384, %rdx # imm = 0x180 ; X64-NEXT: movw %ax, 4(%rdi) +; X64-NEXT: movl %edx, (%rdi) ; X64-NEXT: retq %aa = load i56, i56* %a, align 1 %b = or i56 %aa, 384 @@ -191,15 +190,14 @@ ; X64-NEXT: shll $16, %edx ; X64-NEXT: orl %ecx, %edx ; X64-NEXT: shlq $32, %rdx -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: movl (%rdi), %esi +; X64-NEXT: orq %rdx, %rsi ; X64-NEXT: shlq $13, %rax ; X64-NEXT: movabsq $72057594037919743, %rdx # imm = 0xFFFFFFFFFFDFFF -; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: andq %rsi, %rdx ; X64-NEXT: orq %rax, %rdx +; X64-NEXT: movw %cx, 4(%rdi) ; X64-NEXT: movl %edx, (%rdi) -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: movw %dx, 4(%rdi) ; X64-NEXT: retq %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 diff --git a/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll b/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll --- a/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll +++ b/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll @@ -5,8 +5,9 @@ ; CHECK-LABEL: t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movzwl 0, %eax -; CHECK-NEXT: orl $2, %eax -; CHECK-NEXT: movw %ax, 0 +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: orl $2, %ecx +; CHECK-NEXT: movw %cx, 0 ; CHECK-NEXT: shrl $3, %eax ; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -519,21 +519,20 @@ ; AVX512F-NEXT: kshiftrw $8, %k1, %k2 ; AVX512F-NEXT: vcompresspd %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: movzbl %al, %eax -; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl %ecx -; AVX512F-NEXT: andl $-43, %ecx -; AVX512F-NEXT: subl %ecx, %eax -; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333 -; AVX512F-NEXT: shrl $2, %eax +; AVX512F-NEXT: 
movzbl %al, %ecx +; AVX512F-NEXT: shrl %eax +; AVX512F-NEXT: andl $85, %eax +; AVX512F-NEXT: subl %eax, %ecx +; AVX512F-NEXT: movl %ecx, %eax ; AVX512F-NEXT: andl $858993459, %eax ## imm = 0x33333333 -; AVX512F-NEXT: addl %ecx, %eax -; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $4, %ecx +; AVX512F-NEXT: shrl $2, %ecx +; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333 ; AVX512F-NEXT: addl %eax, %ecx -; AVX512F-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F -; AVX512F-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $4, %eax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F +; AVX512F-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101 ; AVX512F-NEXT: shrl $24, %eax ; AVX512F-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} ; AVX512F-NEXT: vzeroupper @@ -573,21 +572,20 @@ ; AVX512VLBW-NEXT: kshiftrw $8, %k1, %k2 ; AVX512VLBW-NEXT: vcompresspd %zmm0, (%rdi) {%k1} ; AVX512VLBW-NEXT: kmovd %k1, %eax -; AVX512VLBW-NEXT: movzbl %al, %eax -; AVX512VLBW-NEXT: movl %eax, %ecx -; AVX512VLBW-NEXT: shrl %ecx -; AVX512VLBW-NEXT: andl $-43, %ecx -; AVX512VLBW-NEXT: subl %ecx, %eax -; AVX512VLBW-NEXT: movl %eax, %ecx -; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333 -; AVX512VLBW-NEXT: shrl $2, %eax +; AVX512VLBW-NEXT: movzbl %al, %ecx +; AVX512VLBW-NEXT: shrl %eax +; AVX512VLBW-NEXT: andl $85, %eax +; AVX512VLBW-NEXT: subl %eax, %ecx +; AVX512VLBW-NEXT: movl %ecx, %eax ; AVX512VLBW-NEXT: andl $858993459, %eax ## imm = 0x33333333 -; AVX512VLBW-NEXT: addl %ecx, %eax -; AVX512VLBW-NEXT: movl %eax, %ecx -; AVX512VLBW-NEXT: shrl $4, %ecx +; AVX512VLBW-NEXT: shrl $2, %ecx +; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333 ; AVX512VLBW-NEXT: addl %eax, %ecx -; AVX512VLBW-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F -; AVX512VLBW-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 +; AVX512VLBW-NEXT: movl %ecx, %eax +; AVX512VLBW-NEXT: shrl $4, %eax 
+; AVX512VLBW-NEXT: addl %ecx, %eax +; AVX512VLBW-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F +; AVX512VLBW-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101 ; AVX512VLBW-NEXT: shrl $24, %eax ; AVX512VLBW-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} ; AVX512VLBW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll --- a/llvm/test/CodeGen/X86/mul128.ll +++ b/llvm/test/CodeGen/X86/mul128.ll @@ -106,13 +106,14 @@ define void @PR13897() nounwind { ; X64-LABEL: PR13897: ; X64: # %bb.0: # %"0x0" -; X64-NEXT: movl {{.*}}(%rip), %ecx +; X64-NEXT: movq {{.*}}(%rip), %rsi +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shlq $32, %rsi ; X64-NEXT: movabsq $4294967297, %rdx # imm = 0x100000001 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rdx ; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: shlq $32, %rcx -; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: addq %rsi, %rdx ; X64-NEXT: movq %rax, {{.*}}(%rip) ; X64-NEXT: movq %rdx, aaa+{{.*}}(%rip) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shift-mask.ll b/llvm/test/CodeGen/X86/shift-mask.ll --- a/llvm/test/CodeGen/X86/shift-mask.ll +++ b/llvm/test/CodeGen/X86/shift-mask.ll @@ -555,10 +555,11 @@ ; X86-LABEL: test_i64_lshr_lshr_1: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: shldl $3, %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (,%ecx,8), %edx +; X86-NEXT: shldl $3, %eax, %ecx ; X86-NEXT: shll $3, %eax -; X86-NEXT: shrdl $5, %edx, %eax +; X86-NEXT: shrdl $5, %ecx, %eax ; X86-NEXT: shrl $5, %edx ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -297,15 +297,14 @@ ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl %cx, %ecx -; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: 
shrl $16, %edx -; X86-NEXT: shll $16, %ecx +; X86-NEXT: shll $17, %edx +; X86-NEXT: shrl $15, %ecx +; X86-NEXT: andl $1, %ecx ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edx ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %edx ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -57,15 +57,15 @@ ; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000 ; CHECK-NEXT: cmovll %ecx, %edx ; CHECK-NEXT: pextrw $1, %xmm0, %esi -; CHECK-NEXT: movswl %si, %edi -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shrl $15, %eax -; CHECK-NEXT: leal (%rdi,%rdi), %esi -; CHECK-NEXT: shrdw $15, %ax, %si -; CHECK-NEXT: sarl $15, %edi -; CHECK-NEXT: cmpl $16383, %edi # imm = 0x3FFF +; CHECK-NEXT: leal (%rsi,%rsi), %edi +; CHECK-NEXT: movswl %si, %eax +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: shrl $15, %esi +; CHECK-NEXT: shldw $1, %di, %si +; CHECK-NEXT: sarl $15, %eax +; CHECK-NEXT: cmpl $16383, %eax # imm = 0x3FFF ; CHECK-NEXT: cmovgl %r8d, %esi -; CHECK-NEXT: cmpl $-16384, %edi # imm = 0xC000 +; CHECK-NEXT: cmpl $-16384, %eax # imm = 0xC000 ; CHECK-NEXT: cmovll %ecx, %esi ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: cwtl @@ -82,11 +82,11 @@ ; CHECK-NEXT: pinsrw $1, %esi, %xmm1 ; CHECK-NEXT: pinsrw $2, %edx, %xmm1 ; CHECK-NEXT: pextrw $3, %xmm0, %eax +; CHECK-NEXT: leal (,%rax,4), %edx ; CHECK-NEXT: cwtl -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shrl $14, %edx -; CHECK-NEXT: leal (,%rax,4), %esi -; CHECK-NEXT: shrdw $15, %dx, %si +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: shrl $14, %esi +; CHECK-NEXT: shldw $1, %dx, %si ; CHECK-NEXT: sarl $14, %eax ; CHECK-NEXT: cmpl $16383, %eax # imm = 0x3FFF ; CHECK-NEXT: cmovgl %r8d, %esi