Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -750,14 +750,6 @@ /// \return true if stores were merged. bool mergeConsecutiveStores(StoreSDNode *St); - /// Try to transform a truncation where C is a constant: - /// (trunc (and X, C)) -> (and (trunc X), (trunc C)) - /// - /// \p N needs to be a truncation and its first operand an AND. Other - /// requirements are checked by the function (e.g. that trunc is - /// single-use) and if missed an empty SDValue is returned. - SDValue distributeTruncateThroughAnd(SDNode *N); - /// Helper function to determine whether the target supports operation /// given by \p Opcode for type \p VT, that is, whether the operation /// is legal or custom before legalizing operations, and whether is @@ -8035,29 +8027,6 @@ return DAG.getNode(LHS.getOpcode(), DL, VT, NewShift, NewRHS); } -SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) { - assert(N->getOpcode() == ISD::TRUNCATE); - assert(N->getOperand(0).getOpcode() == ISD::AND); - - // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC) - EVT TruncVT = N->getValueType(0); - if (N->hasOneUse() && N->getOperand(0).hasOneUse() && - TLI.isTypeDesirableForOp(ISD::AND, TruncVT)) { - SDValue N01 = N->getOperand(0).getOperand(1); - if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) { - SDLoc DL(N); - SDValue N00 = N->getOperand(0).getOperand(0); - SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00); - SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01); - AddToWorklist(Trunc00.getNode()); - AddToWorklist(Trunc01.getNode()); - return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01); - } - } - - return SDValue(); -} - SDValue DAGCombiner::visitRotate(SDNode *N) { SDLoc dl(N); SDValue N0 = N->getOperand(0); @@ -8100,13 +8069,6 @@ if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); - // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))). - if (N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) - return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1); - } - unsigned NextOp = N0.getOpcode(); // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize) if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { @@ -8178,13 +8140,6 @@ APInt::getAllOnesValue(OpSizeInBits))) return DAG.getConstant(0, SDLoc(N), VT); - // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))). - if (N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) - return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1); - } - if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -8602,13 +8557,6 @@ } } - // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))). - if (N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) - return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1); - } - // fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2)) // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2)) // if c1 is equal to the number of bits the trunc removes @@ -8826,13 +8774,6 @@ } } - // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))). - if (N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) - return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1); - } - // fold operands of srl based on knowledge that the low bits are not // demanded. if (SimplifyDemandedBits(SDValue(N, 0))) @@ -12258,7 +12199,7 @@ case ISD::AND: case ISD::OR: case ISD::XOR: - if (!LegalOperations && N0.hasOneUse() && + if ((!LegalOperations || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) && N0.hasOneUse() && (isConstantOrConstantVector(N0.getOperand(0), true) || isConstantOrConstantVector(N0.getOperand(1), true))) { // TODO: We already restricted this to pre-legalization, but for vectors Index: llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll =================================================================== --- llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll +++ llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll @@ -305,8 +305,8 @@ ; CHECK-CVT: fcvtl ; CHECK-CVT: fcvtl ; CHECK-CVT: fcmeq -; CHECK-CVT: mvn ; CHECK-CVT: xtn +; CHECK-CVT: mvn ; CHECK-CVT: ret ; CHECK-FP16-LABEL: test_fcmp_une: Index: llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll +++ llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll @@ -97,10 +97,10 @@ ; CHECK-NEXT: and v0.16b, v3.16b, v0.16b ; CHECK-NEXT: cmeq v0.2d, v0.2d, v1.2d ; CHECK-NEXT: cmeq v1.2d, v4.2d, v2.2d -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b ; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: xtn v1.2s, v1.2d +; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: mvn v1.8b, v1.8b ; CHECK-NEXT: mov w1, v0.s[1] ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: fmov w2, s1 Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s @@ -4668,45 +4667,45 @@ ; GFX9-NEXT: s_movk_i32 s8, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s0, s6, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX9-NEXT: s_and_b32 s0, s4, s8 -; GFX9-NEXT: s_and_b32 s1, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX9-NEXT: s_bfe_u32 s0, s6, 0xf000f +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f -; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: s_bfe_u32 s0, s4, 0xf000f +; GFX9-NEXT: v_alignbit_b32 v5, s7, v5, 30 +; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX9-NEXT: v_and_b32_e32 v5, s8, v5 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v0 +; GFX9-NEXT: v_alignbit_b32 v1, s5, v1, 30 +; GFX9-NEXT: v_mul_f32_e32 v0, v7, v8 +; GFX9-NEXT: v_and_b32_e32 v1, s8, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mad_f32 v4, -v0, v6, v7 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc +; GFX9-NEXT: v_and_b32_e32 v4, s8, v4 +; GFX9-NEXT: v_mul_f32_e32 v0, v1, v7 +; GFX9-NEXT: v_trunc_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_mad_f32 v0, -v0, v5, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v5 ; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 -; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 -; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 -; GFX9-NEXT: v_and_b32_e32 v3, s8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v4, s8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 @@ -4866,55 +4865,55 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX9-NEXT: s_and_b32 s5, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s0, s6, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX9-NEXT: s_and_b32 s0, s4, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX9-NEXT: s_bfe_u32 s5, s6, 0xf000f -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 +; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: v_mad_f32 v3, -v4, v1, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GFX9-NEXT: v_alignbit_b32 v5, s7, v5, 30 +; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_and_b32_e32 v5, s8, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: v_mad_f32 v4, -v3, v6, v7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v5 +; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 ; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 -; GFX9-NEXT: s_lshr_b32 s0, s6, 15 -; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v7 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: s_lshr_b32 s0, s6, 15 +; GFX9-NEXT: v_mul_f32_e32 v4, v8, v9 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_mad_f32 v4, -v4, v7, v8 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s0 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, v5 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 ; GFX9-NEXT: s_lshr_b32 s0, s4, 15 -; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 -; GFX9-NEXT: v_and_b32_e32 v4, s8, v4 +; GFX9-NEXT: v_sub_u32_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 ; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 +; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v3, s8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v4, s8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 @@ -5083,60 +5082,60 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf0000 ; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX9-NEXT: s_or_b32 s5, s0, 1 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: s_or_b32 s8, s0, 1 +; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cselect_b32 s0, s5, 0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 ; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: s_bfe_i32 s8, s4, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 +; GFX9-NEXT: s_xor_b32 s0, s8, s1 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 +; GFX9-NEXT: s_or_b32 s8, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 -; GFX9-NEXT: s_bfe_i32 s0, s4, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 -; GFX9-NEXT: s_or_b32 s4, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v1 -; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX9-NEXT: v_mad_f32 v1, -v1, v5, v6 +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 +; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v5| ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 ; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 -; GFX9-NEXT: v_and_b32_e32 v3, s0, v4 -; GFX9-NEXT: v_and_b32_e32 v4, s0, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 +; GFX9-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] @@ -5324,53 +5323,53 @@ ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: s_movk_i32 s8, 0x7fff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, s8 -; GFX9-NEXT: s_and_b32 s1, s6, s8 -; GFX9-NEXT: s_bfe_i32 s1, s1, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_and_b32 s1, s4, s8 +; GFX9-NEXT: s_and_b32 s0, s6, s8 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_bfe_i32 s1, s1, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 -; GFX9-NEXT: s_or_b32 s11, s0, 1 ; GFX9-NEXT: s_lshr_b32 s9, s4, 15 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0xf000f -; GFX9-NEXT: s_lshr_b32 s7, s6, 15 -; GFX9-NEXT: s_bfe_u32 s10, s6, 0xf000f -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| +; GFX9-NEXT: s_lshr_b32 s10, s6, 15 +; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX9-NEXT: s_or_b32 s11, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cselect_b32 s0, s11, 0 -; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 -; GFX9-NEXT: s_bfe_i32 s0, s10, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: s_bfe_i32 s1, s5, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-NEXT: s_bfe_u32 s0, s6, 0xf000f +; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f +; GFX9-NEXT: s_bfe_i32 s1, s1, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s5, s0, 1 -; GFX9-NEXT: v_and_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX9-NEXT: s_or_b32 s11, s0, 1 +; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_f32 v2, -v3, v1, v2 +; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v1| ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cselect_b32 s0, s5, 0 -; GFX9-NEXT: v_bfe_i32 v4, v1, 0, 15 -; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 +; GFX9-NEXT: s_cselect_b32 s0, s11, 0 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 +; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX9-NEXT: v_bfe_i32 v4, v3, 0, 15 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, v4 -; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_bfe_i32 v6, v0, 0, 15 +; GFX9-NEXT: v_alignbit_b32 v2, s5, v2, 30 +; GFX9-NEXT: v_and_b32_e32 v2, s8, v2 +; GFX9-NEXT: v_bfe_i32 v6, v2, 0, 15 ; GFX9-NEXT: v_cvt_f32_i32_e32 v7, v6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_xor_b32_e32 v4, v6, v4 @@ -5382,19 +5381,19 @@ ; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| ; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s10 ; GFX9-NEXT: v_add_u32_e32 v4, v8, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s6 -; GFX9-NEXT: v_mul_lo_u32 v1, v4, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 -; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 -; GFX9-NEXT: v_sub_u32_e32 v2, s4, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 +; GFX9-NEXT: v_mul_lo_u32 v3, v4, v3 +; GFX9-NEXT: v_sub_u32_e32 v6, s9, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_sub_u32_e32 v5, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v3, s8, v6 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v2, s8, v2 +; GFX9-NEXT: v_and_b32_e32 v2, s8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX9-NEXT: global_store_dword v4, v0, s[2:3] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 Index: llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -117,22 +117,78 @@ ; For GFX8: since i16 is legal type, we cannot sink lshr into BBs. -; GCN-LABEL: {{^}}sink_ubfe_i16: -; GCN-NOT: lshr -; VI: s_load_dword [[ARG:s[0-9]+]], s[0:1], 0x2c -; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004 -; GCN: s_cbranch_scc{{[0-1]}} - -; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 -; VI: v_mov_b32_e32 v{{[0-9]+}}, 0x7f - -; GCN: BB2_3: -; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 -; VI: v_mov_b32_e32 v{{[0-9]+}}, 0xff - -; GCN: buffer_store_short -; GCN: s_endpgm define amdgpu_kernel void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 { +; SI-LABEL: sink_ubfe_i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s1, s[0:1], 0xb +; SI-NEXT: s_cbranch_scc0 BB2_2 +; SI-NEXT: ; %bb.1: ; %bb1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bfe_u32 s0, s1, 0x70004 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz BB2_3 +; SI-NEXT: s_branch BB2_4 +; SI-NEXT: BB2_2: +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 vcc, vcc +; SI-NEXT: s_cbranch_vccnz BB2_4 +; SI-NEXT: BB2_3: ; %bb0 +; SI-NEXT: s_bfe_u32 s0, s1, 0x80004 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: BB2_4: ; %ret +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: sink_ubfe_i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 0xfff +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s0, s0, 4 +; VI-NEXT: v_and_b32_e32 v1, s0, v0 +; VI-NEXT: s_cbranch_scc0 BB2_2 +; VI-NEXT: ; %bb.1: ; %bb1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: buffer_store_short v2, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v0, 0x7f, v1 +; VI-NEXT: s_cbranch_execz BB2_3 +; VI-NEXT: s_branch BB2_4 +; VI-NEXT: BB2_2: +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: BB2_3: ; %bb0 +; VI-NEXT: v_and_b32_e32 v0, 0xff, v1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: BB2_4: ; %ret +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm entry: %shr = lshr i16 %arg1, 4 br i1 undef, label %bb0, label %bb1 Index: llvm/test/CodeGen/AMDGPU/idot8s.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8s.ll +++ llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2357,77 +2357,86 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 15 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-NEXT: global_load_dword v4, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 12, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v10, 15, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v4, 8, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX9-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v10, v2, v10 -; GFX9-NEXT: v_bfe_u32 v11, v4, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v4 -; GFX9-NEXT: v_bfe_u32 v14, v4, 20, 4 -; GFX9-NEXT: v_bfe_u32 v16, v4, 12, 4 -; GFX9-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v17, v2, v17 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v17 -; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, v3 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX9-NEXT: global_load_ushort v4, v1, s[2:3] -; GFX9-NEXT: v_and_b32_e32 v8, v2, v8 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 15, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v18, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-NEXT: v_and_b32_sdwa v19, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v5, 15, v10 +; GFX9-NEXT: v_and_b32_e32 v10, v4, v16 +; GFX9-NEXT: v_and_b32_e32 v16, v4, v17 +; GFX9-NEXT: v_and_b32_e32 v17, v4, v18 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v6, v15, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v8, v4, v8 +; GFX9-NEXT: v_and_b32_e32 v18, v4, v19 +; GFX9-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-NEXT: v_and_b32_e32 v4, v4, v13 ; GFX9-NEXT: v_lshl_or_b32 v8, v9, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v5, v16, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v6, v2, v6 +; GFX9-NEXT: v_lshl_or_b32 v4, v14, 16, v4 +; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 ; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_and_b32_e32 v13, v2, v13 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v11 -; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; GFX9-NEXT: v_lshl_or_b32 v2, v12, 16, v2 +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, v5, v3 +; GFX9-NEXT: v_lshl_or_b32 v10, v12, 16, v18 ; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, v10 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v8, v5 -; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v6, v7 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v4, v3, v4 -; GFX9-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v8, v4 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v7, v9 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v0 -; GFX9-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16_vecMul: @@ -2440,77 +2449,86 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v4, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(2) +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 12, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-DL-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v4, 8, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v10, v2, v10 -; GFX9-DL-NEXT: v_bfe_u32 v11, v4, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v4, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v4, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v17, v2, v17 -; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v10 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v17 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, v3 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] -; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8 -; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-DL-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-DL-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v18, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v19, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v16 +; GFX9-DL-NEXT: v_and_b32_e32 v16, v4, v17 +; GFX9-DL-NEXT: v_and_b32_e32 v17, v4, v18 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v10 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v15, 16, v17 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v8, v4, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v18, v4, v19 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v13 ; GFX9-DL-NEXT: v_lshl_or_b32 v8, v9, 16, v8 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v16, 16, v15 -; GFX9-DL-NEXT: v_and_b32_e32 v6, v2, v6 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v14, 16, v4 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_and_b32_e32 v13, v2, v13 -; GFX9-DL-NEXT: v_and_b32_e32 v2, v2, v11 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v7, 16, v6 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v12, 16, v2 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v16 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_add_u16_e32 v3, v5, v3 +; GFX9-DL-NEXT: v_lshl_or_b32 v10, v12, 16, v18 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, v10 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v8, v5 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v6, v7 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v4, v3, v4 -; GFX9-DL-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v8, v4 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v7, v9 +; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v0 -; GFX9-DL-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul: @@ -2518,7 +2536,8 @@ ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v23, 0xffff +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v5, 15 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 @@ -2527,76 +2546,84 @@ ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] +; GFX10-DL-XNACK-NEXT: global_load_dword v19, v0, s[4:5] ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, 15, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v19 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v10, v1, 12, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, v4, v11 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v16, v2, 4, 4 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v18, v2, 8, 4 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, v4, v9 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v19, v2, 24, 4 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v17, v2, 20, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v2, v2, 12, 4 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v18 -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v9, v10, 16, v9 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v8, 16, v7 -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v2, 16, v13 -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v16, 15, v19 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v18, 15, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v19 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX10-DL-XNACK-NEXT: v_and_b32_sdwa v17, v19, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_and_b32_sdwa v1, v19, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX10-DL-XNACK-NEXT: v_and_b32_sdwa v22, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_and_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, 15, v10 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v23, v16 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v16, v23, v18 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 12, v19 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v10 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v10, v15, 16, v16 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, v23, v8 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v10, 12, v10 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v23, v13 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v8, v9, 16, v8 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 20, v19 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v9, v14, 16, v13 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v17, v23, v17 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v10 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v15 -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v23, v22 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v17 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v8, v17, 16, v10 -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v1, v3 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, v4, v5 -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v19 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v9, v2 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v1, v10 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v3, v5, v3 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v10, v12, 16, v13 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, v23, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v19 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, v23, v1 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v10 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v8, v9 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v3, v3, v12 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v14, 16, v4 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v1, v2 -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v3, v3, v4 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v1, v6 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v3, v3, v6 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v7, v1, v4 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v7, v5 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v3, v3, v4 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v2, v3, v5 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v2, v1 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v1, v3 ; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DL-XNACK-NEXT: s_endpgm @@ -2607,7 +2634,8 @@ ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v23, 0xffff +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v5, 15 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 @@ -2616,75 +2644,83 @@ ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v19, v0, s[4:5] ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, 15, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v19 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, 15, v0 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v10, v1, 12, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, v4, v11 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v16, v0, 4, 4 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v18, v0, 8, 4 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, v4, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v19, v0, 24, 4 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 16, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v17, v0, 20, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v0, v0, 12, 4 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v18 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v9, v10, 16, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v8, 16, v7 -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v0, 16, v13 -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v16, 15, v19 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v18, 15, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v19 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 20, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_sdwa v17, v19, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_and_b32_sdwa v1, v19, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_sdwa v22, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, 15, v10 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v23, v16 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v16, v23, v18 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 12, v19 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v10 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v10, v15, 16, v16 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, v23, v8 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v10, 12, v10 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v23, v13 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v8, v9, 16, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 20, v19 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v9, v14, 16, v13 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v17, v23, v17 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v10 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v15 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v23, v22 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v17 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v8, v17, 16, v10 -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v1, v1, v3 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, v4, v5 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v19 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v9, v0 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v1, v1, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v3, v5, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v10, v12, 16, v13 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, v23, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v19 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, v23, v1 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v10 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v8, v9 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v3, v3, v12 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v11, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v14, 16, v4 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v1, v0 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v3, v3, v4 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v7, v0, v6 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v3, v3, v6 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v7, v4 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v0, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v0, v1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v3, v3, v4 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v1, v3, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v1, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v0, v3 ; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/idot8u.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8u.ll +++ llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -335,8 +335,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -344,42 +344,51 @@ ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_ushort v18, v[2:3] +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v1, v[2:3] +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v4 -; GFX8-NEXT: v_bfe_u32 v5, v4, 24, 4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v8, v4, 12, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v4 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_and_b32_e32 v17, 15, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v4 +; GFX8-NEXT: v_and_b32_sdwa v18, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v0 -; GFX8-NEXT: v_bfe_u32 v12, v0, 24, 4 -; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v16, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v17, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_sdwa v19, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v5, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v15 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v18 -; GFX8-NEXT: v_mad_u16 v0, v10, v17, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v16, v0 -; GFX8-NEXT: v_mad_u16 v0, v8, v15, v0 -; GFX8-NEXT: v_mad_u16 v0, v7, v14, v0 -; GFX8-NEXT: v_mad_u16 v0, v6, v13, v0 -; GFX8-NEXT: v_mad_u16 v0, v5, v12, v0 -; GFX8-NEXT: v_mad_u16 v0, v1, v11, v0 +; GFX8-NEXT: v_mad_u16 v1, v16, v17, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX8-NEXT: v_mad_u16 v1, v5, v10, v1 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX8-NEXT: v_mad_u16 v1, v9, v14, v1 +; GFX8-NEXT: v_mad_u16 v1, v8, v13, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_mad_u16 v1, v18, v19, v1 +; GFX8-NEXT: v_mad_u16 v1, v7, v12, v1 +; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 +; GFX8-NEXT: v_mad_u16 v0, v6, v11, v0 ; GFX8-NEXT: flat_store_short v[2:3], v0 ; GFX8-NEXT: s_endpgm ; @@ -398,34 +407,43 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v17, v1, s[2:3] +; GFX9-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v3 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v2 +; GFX9-NEXT: v_and_b32_e32 v16, 15, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v2 +; GFX9-NEXT: v_and_b32_sdwa v17, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-NEXT: v_bfe_u32 v11, v3, 24, 4 -; GFX9-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_and_b32_sdwa v18, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v3, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v14 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 +; GFX9-NEXT: v_mad_legacy_u16 v4, v15, v16, v4 +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_mad_legacy_u16 v3, v3, v9, v4 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 +; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-NEXT: v_mad_legacy_u16 v3, v17, v18, v3 +; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 +; GFX9-NEXT: v_mad_legacy_u16 v0, v2, v0, v3 +; GFX9-NEXT: v_mad_legacy_u16 v0, v5, v10, v0 ; GFX9-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -444,34 +462,43 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v17, v1, s[2:3] +; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v17, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-DL-NEXT: v_bfe_u32 v11, v3, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_and_b32_sdwa v18, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v14 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 +; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v15, v16, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v3, v9, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v17, v18, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v0, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v5, v10, v0 ; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -486,41 +513,50 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_dword v11, v0, s[4:5] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v11 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v11 +; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4 +; GFX10-DL-NEXT: v_mad_u16 v3, v6, v13, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 12, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v6 +; GFX10-DL-NEXT: v_mov_b32_e32 v5, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v7 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v11 +; GFX10-DL-NEXT: v_mad_u16 v3, v8, v9, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 20, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v9, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v11 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v8 +; GFX10-DL-NEXT: v_mad_u16 v3, v9, v10, v3 +; GFX10-DL-NEXT: v_and_b32_sdwa v10, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v10, v5, v3 +; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 +; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i16 addrspace(1)* nocapture %dst) { @@ -653,8 +689,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -662,42 +698,51 @@ ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_ubyte v18, v[2:3] +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_ubyte v1, v[2:3] +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v4 -; GFX8-NEXT: v_bfe_u32 v5, v4, 24, 4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v8, v4, 12, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v4 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_and_b32_e32 v17, 15, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v4 +; GFX8-NEXT: v_and_b32_sdwa v18, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v0 -; GFX8-NEXT: v_bfe_u32 v12, v0, 24, 4 -; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v16, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v17, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_sdwa v19, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v5, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v15 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v18 -; GFX8-NEXT: v_mad_u16 v0, v10, v17, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v16, v0 -; GFX8-NEXT: v_mad_u16 v0, v8, v15, v0 -; GFX8-NEXT: v_mad_u16 v0, v7, v14, v0 -; GFX8-NEXT: v_mad_u16 v0, v6, v13, v0 -; GFX8-NEXT: v_mad_u16 v0, v5, v12, v0 -; GFX8-NEXT: v_mad_u16 v0, v1, v11, v0 +; GFX8-NEXT: v_mad_u16 v1, v16, v17, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX8-NEXT: v_mad_u16 v1, v5, v10, v1 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX8-NEXT: v_mad_u16 v1, v9, v14, v1 +; GFX8-NEXT: v_mad_u16 v1, v8, v13, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_mad_u16 v1, v18, v19, v1 +; GFX8-NEXT: v_mad_u16 v1, v7, v12, v1 +; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 +; GFX8-NEXT: v_mad_u16 v0, v6, v11, v0 ; GFX8-NEXT: flat_store_byte v[2:3], v0 ; GFX8-NEXT: s_endpgm ; @@ -716,34 +761,43 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v17, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v3 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v2 +; GFX9-NEXT: v_and_b32_e32 v16, 15, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v2 +; GFX9-NEXT: v_and_b32_sdwa v17, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-NEXT: v_bfe_u32 v11, v3, 24, 4 -; GFX9-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_and_b32_sdwa v18, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v3, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v14 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 +; GFX9-NEXT: v_mad_legacy_u16 v4, v15, v16, v4 +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_mad_legacy_u16 v3, v3, v9, v4 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 +; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-NEXT: v_mad_legacy_u16 v3, v17, v18, v3 +; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 +; GFX9-NEXT: v_mad_legacy_u16 v0, v2, v0, v3 +; GFX9-NEXT: v_mad_legacy_u16 v0, v5, v10, v0 ; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -762,34 +816,43 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v17, v1, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v17, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-DL-NEXT: v_bfe_u32 v11, v3, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_and_b32_sdwa v18, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v14 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 +; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v15, v16, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v3, v9, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v17, v18, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v0, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v5, v10, v0 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -804,41 +867,50 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_dword v11, v0, s[4:5] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v11 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v11 +; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4 +; GFX10-DL-NEXT: v_mad_u16 v3, v6, v13, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 12, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v6 +; GFX10-DL-NEXT: v_mov_b32_e32 v5, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v7 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v11 +; GFX10-DL-NEXT: v_mad_u16 v3, v8, v9, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 20, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v9, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v11 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v8 +; GFX10-DL-NEXT: v_mad_u16 v3, v9, v10, v3 +; GFX10-DL-NEXT: v_and_b32_sdwa v10, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v10, v5, v3 +; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { @@ -972,8 +1044,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -981,42 +1053,51 @@ ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_ubyte v18, v[2:3] +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_ubyte v1, v[2:3] +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v8, v4, 12, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v4 +; GFX8-NEXT: v_and_b32_e32 v18, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v0 -; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v16, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v17, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v19, 15, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 4, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 12, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v5, 15, v11 +; GFX8-NEXT: v_and_b32_e32 v11, 15, v17 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v18 -; GFX8-NEXT: v_mad_u16 v0, v10, v17, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v16, v0 -; GFX8-NEXT: v_mad_u16 v0, v8, v15, v0 -; GFX8-NEXT: v_mad_u16 v0, v7, v14, v0 -; GFX8-NEXT: v_mad_u16 v0, v6, v13, v0 -; GFX8-NEXT: v_mad_u16 v0, v5, v12, v0 -; GFX8-NEXT: v_mad_u16 v0, v1, v11, v0 +; GFX8-NEXT: v_mad_u16 v1, v18, v19, v1 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX8-NEXT: v_mad_u16 v1, v5, v11, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX8-NEXT: v_mad_u16 v1, v10, v16, v1 +; GFX8-NEXT: v_mad_u16 v1, v9, v15, v1 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 +; GFX8-NEXT: v_mad_u16 v0, v8, v14, v0 +; GFX8-NEXT: v_mad_u16 v0, v7, v13, v0 +; GFX8-NEXT: v_mad_u16 v0, v6, v12, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: flat_store_byte v[2:3], v0 ; GFX8-NEXT: s_endpgm @@ -1036,34 +1117,43 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v17, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX9-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX9-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 4, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 15, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 15, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 20, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v3, 15, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 15, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 +; GFX9-NEXT: v_mad_legacy_u16 v4, v17, v18, v4 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-NEXT: v_mad_legacy_u16 v3, v3, v10, v4 +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-NEXT: v_mad_legacy_u16 v3, v9, v15, v3 +; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v14, v3 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_mad_legacy_u16 v0, v2, v0, v3 +; GFX9-NEXT: v_mad_legacy_u16 v0, v7, v13, v0 +; GFX9-NEXT: v_mad_legacy_u16 v0, v6, v12, v0 +; GFX9-NEXT: v_mad_legacy_u16 v0, v5, v11, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm @@ -1083,34 +1173,43 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v17, v1, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 4, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v18, 15, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 20, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 8, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX9-DL-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v16 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 +; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v17, v18, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v3, v10, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v15, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v14, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v0, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v7, v13, v0 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v6, v12, v0 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v5, v11, v0 ; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm @@ -1126,42 +1225,51 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_dword v11, v0, s[4:5] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v11 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v11 +; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v6, v13, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 12, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v6 +; GFX10-DL-NEXT: v_mov_b32_e32 v5, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v7 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v11 +; GFX10-DL-NEXT: v_mad_u16 v3, v8, v9, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 20, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v10, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v11 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v8 +; GFX10-DL-NEXT: v_mad_u16 v3, v10, v5, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v11 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v10, v3 +; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i4 addrspace(1)* nocapture %dst) { @@ -1279,8 +1387,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1288,42 +1396,51 @@ ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_ubyte v18, v[2:3] +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_ubyte v1, v[2:3] +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v8, v4, 12, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v4 +; GFX8-NEXT: v_and_b32_e32 v18, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v0 -; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v16, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v17, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v19, 15, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 4, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 12, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v0 +; GFX8-NEXT: v_and_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v5, 15, v11 +; GFX8-NEXT: v_and_b32_e32 v11, 15, v17 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v0, v4, v0, v18 -; GFX8-NEXT: v_mad_u16 v0, v10, v17, v0 -; GFX8-NEXT: v_mad_u16 v0, v9, v16, v0 -; GFX8-NEXT: v_mad_u16 v0, v8, v15, v0 -; GFX8-NEXT: v_mad_u16 v0, v7, v14, v0 -; GFX8-NEXT: v_mad_u16 v0, v6, v13, v0 -; GFX8-NEXT: v_mad_u16 v0, v5, v12, v0 -; GFX8-NEXT: v_mad_u16 v0, v1, v11, v0 +; GFX8-NEXT: v_mad_u16 v1, v18, v19, v1 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX8-NEXT: v_mad_u16 v1, v5, v11, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX8-NEXT: v_mad_u16 v1, v10, v16, v1 +; GFX8-NEXT: v_mad_u16 v1, v9, v15, v1 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 +; GFX8-NEXT: v_mad_u16 v0, v8, v14, v0 +; GFX8-NEXT: v_mad_u16 v0, v7, v13, v0 +; GFX8-NEXT: v_mad_u16 v0, v6, v12, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: flat_store_byte v[2:3], v0 ; GFX8-NEXT: s_endpgm @@ -1343,34 +1460,43 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v17, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX9-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX9-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 4, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 15, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 15, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 20, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v3, 15, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 15, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 +; GFX9-NEXT: v_mad_legacy_u16 v4, v17, v18, v4 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-NEXT: v_mad_legacy_u16 v3, v3, v10, v4 +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-NEXT: v_mad_legacy_u16 v3, v9, v15, v3 +; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v14, v3 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_mad_legacy_u16 v0, v2, v0, v3 +; GFX9-NEXT: v_mad_legacy_u16 v0, v7, v13, v0 +; GFX9-NEXT: v_mad_legacy_u16 v0, v6, v12, v0 +; GFX9-NEXT: v_mad_legacy_u16 v0, v5, v11, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm @@ -1390,34 +1516,43 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v17, v1, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 4, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v18, 15, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 20, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 8, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX9-DL-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v16 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v17 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v9, v16, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v8, v15, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v14, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v13, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v5, v12, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v4, v11, v2 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v10, v2 +; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v17, v18, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v3, v10, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v15, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v14, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v0, v3 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v7, v13, v0 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v6, v12, v0 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v5, v11, v0 ; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm @@ -1433,42 +1568,51 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_dword v11, v0, s[4:5] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v11 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v11 +; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v6, v13, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 12, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v6 +; GFX10-DL-NEXT: v_mov_b32_e32 v5, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v7 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v11 +; GFX10-DL-NEXT: v_mad_u16 v3, v8, v9, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 20, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v10, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v11 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v8 +; GFX10-DL-NEXT: v_mad_u16 v3, v10, v5, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v11 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v10, v3 +; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i4 addrspace(1)* nocapture %dst) { @@ -2168,8 +2312,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2177,42 +2321,51 @@ ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_load_ushort v18, v[2:3] +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v1, v[2:3] +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_and_b32_e32 v1, 15, v4 -; GFX8-NEXT: v_bfe_u32 v5, v4, 4, 4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 12, 4 -; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 4, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 20, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 28, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v11, 15, v0 -; GFX8-NEXT: v_bfe_u32 v12, v0, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v0 +; GFX8-NEXT: v_and_b32_sdwa v16, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v17, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v18, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_and_b32_e32 v11, 15, v11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v1, v1, v11, v18 -; GFX8-NEXT: v_bfe_u32 v13, v0, 8, 4 -; GFX8-NEXT: v_mad_u16 v1, v5, v12, v1 -; GFX8-NEXT: v_bfe_u32 v14, v0, 12, 4 -; GFX8-NEXT: v_mad_u16 v1, v6, v13, v1 -; GFX8-NEXT: v_bfe_u32 v15, v0, 16, 4 -; GFX8-NEXT: v_mad_u16 v1, v7, v14, v1 -; GFX8-NEXT: v_bfe_u32 v9, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v16, v0, 20, 4 -; GFX8-NEXT: v_mad_u16 v1, v8, v15, v1 -; GFX8-NEXT: v_bfe_u32 v10, v4, 24, 4 -; GFX8-NEXT: v_bfe_u32 v17, v0, 24, 4 -; GFX8-NEXT: v_mad_u16 v1, v9, v16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 28, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0 -; GFX8-NEXT: v_mad_u16 v1, v10, v17, v1 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_mad_u16 v0, v6, v11, v0 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX8-NEXT: v_mad_u16 v0, v7, v12, v0 +; GFX8-NEXT: v_mad_u16 v0, v8, v13, v0 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX8-NEXT: v_mad_u16 v0, v17, v5, v0 +; GFX8-NEXT: v_mad_u16 v0, v9, v14, v0 +; GFX8-NEXT: v_mad_u16 v0, v16, v18, v0 +; GFX8-NEXT: v_mad_u16 v0, v10, v15, v0 ; GFX8-NEXT: flat_store_short v[2:3], v0 ; GFX8-NEXT: s_endpgm ; @@ -2226,61 +2379,70 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 15 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-NEXT: global_load_dword v4, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfe_u32 v11, v4, 24, 4 -; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v4, 8, 4 -; GFX9-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-NEXT: v_and_b32_e32 v10, 15, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX9-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX9-NEXT: v_and_b32_e32 v8, v2, v8 -; GFX9-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v4 -; GFX9-NEXT: v_bfe_u32 v14, v4, 20, 4 -; GFX9-NEXT: v_bfe_u32 v16, v4, 12, 4 -; GFX9-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v17, v2, v17 -; GFX9-NEXT: v_and_b32_e32 v11, v2, v11 -; GFX9-NEXT: v_and_b32_e32 v13, v2, v13 -; GFX9-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v10 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v17 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v4 -; GFX9-NEXT: global_load_ushort v4, v1, s[2:3] -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v5, v14, 16, v13 -; GFX9-NEXT: v_lshl_or_b32 v7, v16, 16, v15 -; GFX9-NEXT: v_lshl_or_b32 v8, v9, 16, v8 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, v6, v5 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v8, v7 -; GFX9-NEXT: v_lshl_or_b32 v10, v12, 16, v11 -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX9-NEXT: v_and_b32_e32 v16, 15, v1 +; GFX9-NEXT: v_and_b32_e32 v18, 15, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX9-NEXT: v_and_b32_sdwa v19, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 12, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, 15, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 15, v15 +; GFX9-NEXT: v_and_b32_e32 v16, v5, v16 +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_and_b32_e32 v18, v5, v18 +; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v17, v5, v17 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-NEXT: v_and_b32_e32 v15, v5, v19 +; GFX9-NEXT: v_and_b32_e32 v11, v5, v13 +; GFX9-NEXT: v_and_b32_e32 v5, v5, v8 +; GFX9-NEXT: v_lshl_or_b32 v8, v10, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 +; GFX9-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-NEXT: v_lshl_or_b32 v6, v12, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v4, v2, v4 -; GFX9-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v0 -; GFX9-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v7, v6 +; GFX9-NEXT: v_lshl_or_b32 v6, v14, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v5, v9, 16, v5 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v6 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16_vecMul: @@ -2293,61 +2455,70 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v4, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(2) +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_u32 v11, v4, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v4, 8, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX9-DL-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8 -; GFX9-DL-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v4, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v4, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v17, v2, v17 -; GFX9-DL-NEXT: v_and_b32_e32 v11, v2, v11 -; GFX9-DL-NEXT: v_and_b32_e32 v13, v2, v13 -; GFX9-DL-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-DL-NEXT: v_and_b32_e32 v2, v2, v10 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v17 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] -; GFX9-DL-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v7, 16, v6 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v14, 16, v13 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v16, 16, v15 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v9, 16, v8 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v6, v5 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v8, v7 -; GFX9-DL-NEXT: v_lshl_or_b32 v10, v12, 16, v11 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v10 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v18, 15, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v19, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 12, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v15 +; GFX9-DL-NEXT: v_and_b32_e32 v16, v5, v16 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_and_b32_e32 v18, v5, v18 +; GFX9-DL-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v17, v5, v17 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-DL-NEXT: v_and_b32_e32 v15, v5, v19 +; GFX9-DL-NEXT: v_and_b32_e32 v11, v5, v13 +; GFX9-DL-NEXT: v_and_b32_e32 v5, v5, v8 +; GFX9-DL-NEXT: v_lshl_or_b32 v8, v10, 16, v18 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v16 +; GFX9-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v12, 16, v15 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v17 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v4, v2, v4 -; GFX9-DL-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v0 -; GFX9-DL-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v7, v6 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v14, 16, v11 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 +; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 +; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: @@ -2355,7 +2526,8 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10-DL-NEXT: v_mov_b32_e32 v19, 0xffff +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 15 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 @@ -2369,55 +2541,63 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NEXT: v_bfe_u32 v19, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-NEXT: v_lshl_or_b32 v7, v9, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v15, 16, v6 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v19 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v7, v6 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v23, 28, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v16, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v18, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX10-DL-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX10-DL-NEXT: v_and_b32_e32 v15, v19, v18 +; GFX10-DL-NEXT: v_and_b32_e32 v16, v19, v16 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 12, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX10-DL-NEXT: v_lshl_or_b32 v14, v14, 16, v15 +; GFX10-DL-NEXT: v_lshl_or_b32 v15, v10, 16, v16 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX10-DL-NEXT: v_and_b32_e32 v12, v19, v12 +; GFX10-DL-NEXT: v_and_b32_e32 v8, v19, v8 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v10, v15, v14 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshl_or_b32 v12, v13, 16, v12 +; GFX10-DL-NEXT: v_lshl_or_b32 v15, v9, 16, v8 +; GFX10-DL-NEXT: v_and_b32_sdwa v14, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v10 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v6, v3 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v11, v4, v11 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v12 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 24, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11 +; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v10, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v11 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v13, v19, v17 +; GFX10-DL-NEXT: v_and_b32_e32 v11, v19, v14 +; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v9 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v15, v12 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshl_or_b32 v9, v10, 16, v11 +; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v13 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v1, v19, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GFX10-DL-NEXT: v_add_nc_u16_e64 v14, v3, v9 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v10 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v5 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v6 -; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v14, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v9 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v23, 16, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2 -; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v3, v5 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v1, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, v19, v4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v9 +; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v10 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v5 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX10-DL-NEXT: v_add_nc_u16_e64 v2, v3, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v1, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -2557,71 +2737,80 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[2:3] +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_ubyte v5, v[0:1] +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_bfe_u32 v3, v4, 20, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v4, 24, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 28, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2 -; GFX8-NEXT: v_bfe_u32 v14, v2, 24, 4 -; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v12, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 12, 4 -; GFX8-NEXT: v_and_b32_e32 v11, 15, v4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 12, 4 -; GFX8-NEXT: v_and_b32_e32 v18, 15, v2 -; GFX8-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX8-NEXT: v_bfe_u32 v2, v2, 4, 4 -; GFX8-NEXT: v_mul_lo_u16_e32 v13, v7, v14 -; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v19, v6, v12 -; GFX8-NEXT: v_mul_lo_u16_e32 v9, v9, v16 -; GFX8-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v11, v11, v18 -; GFX8-NEXT: v_mul_lo_u16_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v8, v13, v8 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX8-NEXT: v_or_b32_e32 v10, v11, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX8-NEXT: v_and_b32_sdwa v17, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v18, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v19, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_sdwa v6, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v11, v18, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v15 +; GFX8-NEXT: v_mul_lo_u16_e32 v15, v17, v19 +; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v6, v11, v6 +; GFX8-NEXT: v_mul_lo_u16_e32 v8, v8, v13 +; GFX8-NEXT: v_mul_lo_u16_sdwa v9, v9, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_sdwa v10, v2, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX8-NEXT: v_mul_lo_u16_e32 v3, v16, v3 +; GFX8-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX8-NEXT: v_or_b32_e32 v7, v15, v7 +; GFX8-NEXT: v_or_b32_e32 v9, v3, v10 +; GFX8-NEXT: v_or_b32_sdwa v3, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_or_b32_e32 v10, v10, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v10 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v3, v10, v5 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v9 +; GFX8-NEXT: v_add_u16_e32 v4, v9, v4 +; GFX8-NEXT: v_add_u16_e32 v3, v4, v3 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v8 ; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 -; GFX8-NEXT: v_mad_u16 v2, v6, v12, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v8 +; GFX8-NEXT: v_mad_u16 v2, v17, v19, v2 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; GFX8-NEXT: v_mad_u16 v2, v18, v5, v2 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v6 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2641,53 +2830,62 @@ ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_bfe_u32 v0, v1, 20, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 28, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX9-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v11, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX9-NEXT: v_and_b32_e32 v10, 15, v1 -; GFX9-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX9-NEXT: v_and_b32_e32 v17, 15, v2 -; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4 -; GFX9-NEXT: v_mul_lo_u16_e32 v12, v6, v13 -; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v18, v5, v11 -; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15 -; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v17 -; GFX9-NEXT: v_or_b32_e32 v7, v12, v7 -; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_or_b32_e32 v1, v18, v0 -; GFX9-NEXT: v_or_b32_e32 v9, v10, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-NEXT: v_and_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v18, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v11, v16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 15, v2 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-NEXT: v_and_b32_e32 v0, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v14 +; GFX9-NEXT: v_mul_lo_u16_e32 v14, v17, v2 +; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v11, v6 +; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v12 +; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v14, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX9-NEXT: v_mul_lo_u16_e32 v1, v15, v1 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_or_b32_e32 v8, v1, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v1, v9, v4 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v2 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v8 +; GFX9-NEXT: v_add_u16_e32 v4, v8, v4 +; GFX9-NEXT: v_add_u16_e32 v1, v4, v1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v7 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-NEXT: v_mad_legacy_u16 v0, v5, v11, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: v_mad_legacy_u16 v0, v6, v13, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v7 +; GFX9-NEXT: v_mad_legacy_u16 v0, v17, v2, v0 +; GFX9-NEXT: v_add_u16_e32 v0, v0, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; GFX9-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 +; GFX9-NEXT: v_add_u16_e32 v0, v0, v6 ; GFX9-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -2707,53 +2905,62 @@ ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 20, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v12, v6, v13 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v5, v11 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v17 -; GFX9-DL-NEXT: v_or_b32_e32 v7, v12, v7 -; GFX9-DL-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-DL-NEXT: v_or_b32_e32 v1, v18, v0 -; GFX9-DL-NEXT: v_or_b32_e32 v9, v10, v2 -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v18, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v11, v16, v18 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v14 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v14, v17, v2 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v6, v11, v6 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v12 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v0, v14, v5 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, v15, v1 +; GFX9-DL-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-DL-NEXT: v_or_b32_e32 v8, v1, v9 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-DL-NEXT: v_or_b32_e32 v9, v9, v0 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v9 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v1, v9, v4 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 +; GFX9-DL-NEXT: v_add_u16_e32 v4, v8, v4 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v4, v1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v7 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v5, v11, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v10 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v6, v13, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v17, v2, v0 +; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 +; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v6 ; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -2773,57 +2980,66 @@ ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-DL-NEXT: global_load_ubyte v3, v19, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, v9, v10 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v13 -; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 24, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v1 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 8, v9 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 4, 4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v14 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v23, v2, 16, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v1, v1, v15 -; GFX10-DL-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, v0, v10 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, v6, v13 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, v11, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v15, 15, v1 +; GFX10-DL-NEXT: v_and_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v13 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v1, v1, v8 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v18, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v27, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v9 +; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v14 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v12 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v1, 8, v1 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, v5, v23 -; GFX10-DL-NEXT: v_or_b32_e32 v7, v10, v7 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 8, v9 -; GFX10-DL-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX10-DL-NEXT: v_or_b32_e32 v2, v11, v9 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v6, v11 +; GFX10-DL-NEXT: v_and_b32_e32 v23, 15, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v10 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v0, v0, v2 +; GFX10-DL-NEXT: v_or_b32_e32 v7, v7, v1 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v6 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, v31, v18 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, v23, v9 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v0 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, v15, v13 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v15, v17, v27 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v2 +; GFX10-DL-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX10-DL-NEXT: v_or_b32_sdwa v1, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_e32 v6, v9, v6 +; GFX10-DL-NEXT: v_or_b32_e32 v2, v15, v2 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v1, v3 +; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v6, v3 ; GFX10-DL-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u16_e64 v9, v3, v10 +; GFX10-DL-NEXT: v_add_nc_u16_e64 v6, v3, v10 ; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX10-DL-NEXT: v_add_nc_u16_e64 v0, v9, v8 +; GFX10-DL-NEXT: v_add_nc_u16_e64 v0, v6, v7 ; GFX10-DL-NEXT: v_add_nc_u16_e64 v0, v0, v2 -; GFX10-DL-NEXT: v_mad_u16 v0, v5, v23, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v17, v27, v0 ; GFX10-DL-NEXT: v_add_nc_u16_e64 v0, v0, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v13, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v5 +; GFX10-DL-NEXT: v_mad_u16 v0, v31, v18, v0 ; GFX10-DL-NEXT: v_add_nc_u16_e64 v0, v0, v1 ; GFX10-DL-NEXT: global_store_byte v19, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -236,11 +236,10 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_add_i32 s1, s1, 12 ; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; VI-NEXT: s_or_b32 s0, s1, 4 +; VI-NEXT: s_add_i32 s1, s1, 12 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_or_b32_e64 v1, s1, 4 ; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 Index: llvm/test/CodeGen/SystemZ/scalar-ctlz.ll =================================================================== --- llvm/test/CodeGen/SystemZ/scalar-ctlz.ll +++ llvm/test/CodeGen/SystemZ/scalar-ctlz.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; ; FIXME: two consecutive immediate adds not fused in i16/i8 functions. @@ -9,95 +10,90 @@ define i64 @f0(i64 %arg) { ; CHECK-LABEL: f0: -; CHECK-LABEL: %bb.0: -; CHECK-NOT: %bb.1: -; CHECK: flogr +; CHECK: # %bb.0: +; CHECK-NEXT: flogr %r2, %r2 +; CHECK-NEXT: # kill: def $r2d killed $r2d killed $r2q +; CHECK-NEXT: br %r14 %1 = tail call i64 @llvm.ctlz.i64(i64 %arg, i1 false) ret i64 %1 } define i64 @f1(i64 %arg) { ; CHECK-LABEL: f1: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: flogr -; CHECK-NEXT: # kill -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: flogr %r2, %r2 +; CHECK-NEXT: # kill: def $r2d killed $r2d killed $r2q +; CHECK-NEXT: br %r14 %1 = tail call i64 @llvm.ctlz.i64(i64 %arg, i1 true) ret i64 %1 } define i32 @f2(i32 %arg) { ; CHECK-LABEL: f2: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: llgfr %r0, %r2 -; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 -; CHECK-NEXT: # kill -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: llgfr %r0, %r2 +; CHECK-NEXT: flogr %r2, %r0 +; CHECK-NEXT: ahi %r2, -32 +; CHECK-NEXT: br %r14 %1 = tail call i32 @llvm.ctlz.i32(i32 %arg, i1 false) ret i32 %1 } define i32 @f3(i32 %arg) { ; CHECK-LABEL: f3: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: llgfr %r0, %r2 -; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 -; CHECK-NEXT: # kill -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: llgfr %r0, %r2 +; CHECK-NEXT: flogr %r2, %r0 +; CHECK-NEXT: ahi %r2, -32 +; CHECK-NEXT: br %r14 %1 = tail call i32 @llvm.ctlz.i32(i32 %arg, i1 true) ret i32 %1 } define i16 @f4(i16 %arg) { ; CHECK-LABEL: f4: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: # kill -; CHECK-NEXT: llghr %r0, %r2 -; CHECK-NEXT: flogr %r0, %r0 -; CHECK-NEXT: aghi %r0, -32 -; CHECK-NEXT: ahik %r2, %r0, -16 -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d +; CHECK-NEXT: llghr %r0, %r2 +; CHECK-NEXT: flogr %r2, %r0 +; CHECK-NEXT: ahi %r2, -48 +; CHECK-NEXT: br %r14 %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 false) ret i16 %1 } define i16 @f5(i16 %arg) { ; CHECK-LABEL: f5: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: # kill -; CHECK-NEXT: llghr %r0, %r2 -; CHECK-NEXT: flogr %r0, %r0 -; CHECK-NEXT: aghi %r0, -32 -; CHECK-NEXT: ahik %r2, %r0, -16 -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d +; CHECK-NEXT: llghr %r0, %r2 +; CHECK-NEXT: flogr %r2, %r0 +; CHECK-NEXT: ahi %r2, -48 +; CHECK-NEXT: br %r14 %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 true) ret i16 %1 } define i8 @f6(i8 %arg) { ; CHECK-LABEL: f6: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: # kill -; CHECK-NEXT: llgcr %r0, %r2 -; CHECK-NEXT: flogr %r0, %r0 -; CHECK-NEXT: aghi %r0, -32 -; CHECK-NEXT: ahik %r2, %r0, -24 -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d +; CHECK-NEXT: llgcr %r0, %r2 +; CHECK-NEXT: flogr %r2, %r0 +; CHECK-NEXT: ahi %r2, -56 +; CHECK-NEXT: br %r14 %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 false) ret i8 %1 } define i8 @f7(i8 %arg) { ; CHECK-LABEL: f7: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: # kill -; CHECK-NEXT: llgcr %r0, %r2 -; CHECK-NEXT: flogr %r0, %r0 -; CHECK-NEXT: aghi %r0, -32 -; CHECK-NEXT: ahik %r2, %r0, -24 -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d +; CHECK-NEXT: llgcr %r0, %r2 +; CHECK-NEXT: flogr %r2, %r0 +; CHECK-NEXT: ahi %r2, -56 +; CHECK-NEXT: br %r14 %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 true) ret i8 %1 } Index: llvm/test/CodeGen/X86/and-encoding.ll =================================================================== --- llvm/test/CodeGen/X86/and-encoding.ll +++ llvm/test/CodeGen/X86/and-encoding.ll @@ -22,7 +22,7 @@ define void @f2(i16 %x, i1 *%y) nounwind { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: -; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01] +; CHECK-NEXT: andb $1, %dil # encoding: [0x40,0x80,0xe7,0x01] ; CHECK-NEXT: movb %dil, (%rsi) # encoding: [0x40,0x88,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %c = trunc i16 %x to i1 @@ -33,7 +33,7 @@ define void @f3(i32 %x, i1 *%y) nounwind { ; CHECK-LABEL: f3: ; CHECK: # %bb.0: -; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01] +; CHECK-NEXT: andb $1, %dil # encoding: [0x40,0x80,0xe7,0x01] ; CHECK-NEXT: movb %dil, (%rsi) # encoding: [0x40,0x88,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %c = trunc i32 %x to i1 Index: llvm/test/CodeGen/X86/avx512-calling-conv.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -942,7 +942,7 @@ ; KNL-NEXT: kshiftrw $13, %k0, %k1 ; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftrw $14, %k0, %k1 -; KNL-NEXT: andl $1, %r8d +; KNL-NEXT: andb $1, %r8b ; KNL-NEXT: movb %r8b, 2(%rax) ; KNL-NEXT: kmovw %k0, %r8d ; KNL-NEXT: andl $1, %r8d @@ -1257,7 +1257,7 @@ ; SKX-NEXT: kshiftrd $13, %k0, %k1 ; SKX-NEXT: kmovd %k1, %edi ; SKX-NEXT: kshiftrd $14, %k0, %k1 -; SKX-NEXT: andl $1, %r8d +; SKX-NEXT: andb $1, %r8b ; SKX-NEXT: movb %r8b, 2(%rax) ; SKX-NEXT: kmovd %k0, %r8d ; SKX-NEXT: andl $1, %r8d @@ -1574,7 +1574,7 @@ ; KNL_X32-NEXT: kshiftrw $5, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %ecx ; KNL_X32-NEXT: kshiftrw $6, %k0, %k1 -; KNL_X32-NEXT: andl $1, %ebx +; KNL_X32-NEXT: andb $1, %bl ; KNL_X32-NEXT: movb %bl, 2(%eax) ; KNL_X32-NEXT: kmovw %k0, %ebx ; KNL_X32-NEXT: andl $1, %ebx @@ -1904,7 +1904,7 @@ ; FASTISEL-NEXT: kshiftrd $13, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %edi ; FASTISEL-NEXT: kshiftrd $14, %k0, %k1 -; FASTISEL-NEXT: andl $1, %r8d +; FASTISEL-NEXT: andb $1, %r8b ; FASTISEL-NEXT: movb %r8b, 2(%rax) ; FASTISEL-NEXT: kmovd %k0, %r8d ; FASTISEL-NEXT: andl $1, %r8d Index: llvm/test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-mask-op.ll +++ llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1871,15 +1871,15 @@ define void @store_i16_i1(i16 %x, i1 *%y) { ; CHECK-LABEL: store_i16_i1: ; CHECK: ## %bb.0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: andb $1, %dil ; CHECK-NEXT: movb %dil, (%rsi) ; CHECK-NEXT: retq ; ; X86-LABEL: store_i16_i1: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl $1, %ecx +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: andb $1, %cl ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: retl %c = trunc i16 %x to i1 @@ -1890,7 +1890,7 @@ define void @store_i8_i1(i8 %x, i1 *%y) { ; CHECK-LABEL: store_i8_i1: ; CHECK: ## %bb.0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: andb $1, %dil ; CHECK-NEXT: movb %dil, (%rsi) ; CHECK-NEXT: retq ; Index: llvm/test/CodeGen/X86/bool-math.ll =================================================================== --- llvm/test/CodeGen/X86/bool-math.ll +++ llvm/test/CodeGen/X86/bool-math.ll @@ -266,7 +266,7 @@ ; X64-NEXT: shrq $32, %rdi ; X64-NEXT: shrq $32, %rax ; X64-NEXT: xorl %edi, %eax -; X64-NEXT: andl $1, %eax +; X64-NEXT: andb $1, %al ; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq ; Index: llvm/test/CodeGen/X86/clz.ll =================================================================== --- llvm/test/CodeGen/X86/clz.ll +++ llvm/test/CodeGen/X86/clz.ll @@ -151,7 +151,7 @@ ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $7, %eax +; X86-NEXT: xorb $7, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -159,7 +159,7 @@ ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: bsrl %eax, %eax -; X64-NEXT: xorl $7, %eax +; X64-NEXT: xorb $7, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -167,7 +167,7 @@ ; X86-CLZ: # %bb.0: ; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-CLZ-NEXT: lzcntl %eax, %eax -; X86-CLZ-NEXT: addl $-24, %eax +; X86-CLZ-NEXT: addb $-24, %al ; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X86-CLZ-NEXT: retl ; @@ -175,7 +175,7 @@ ; X64-CLZ: # %bb.0: ; X64-CLZ-NEXT: movzbl %dil, %eax ; X64-CLZ-NEXT: lzcntl %eax, %eax -; X64-CLZ-NEXT: addl $-24, %eax +; X64-CLZ-NEXT: addb $-24, %al ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq %tmp2 = call i8 @llvm.ctlz.i8( i8 %x, i1 true ) @@ -306,7 +306,7 @@ ; X86-NEXT: # %bb.2: # %cond.false ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $7, %eax +; X86-NEXT: xorb $7, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; X86-NEXT: .LBB8_1: @@ -321,7 +321,7 @@ ; X64-NEXT: # %bb.2: # %cond.false ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: bsrl %eax, %eax -; X64-NEXT: xorl $7, %eax +; X64-NEXT: xorb $7, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; X64-NEXT: .LBB8_1: @@ -333,7 +333,7 @@ ; X86-CLZ: # %bb.0: ; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-CLZ-NEXT: lzcntl %eax, %eax -; X86-CLZ-NEXT: addl $-24, %eax +; X86-CLZ-NEXT: addb $-24, %al ; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X86-CLZ-NEXT: retl ; @@ -341,7 +341,7 @@ ; X64-CLZ: # %bb.0: ; X64-CLZ-NEXT: movzbl %dil, %eax ; X64-CLZ-NEXT: lzcntl %eax, %eax -; X64-CLZ-NEXT: addl $-24, %eax +; X64-CLZ-NEXT: addb $-24, %al ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq %tmp1 = call i8 @llvm.ctlz.i8(i8 %n, i1 false) @@ -873,7 +873,7 @@ ; X86-NEXT: orb $64, %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $7, %eax +; X86-NEXT: xorb $7, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -882,7 +882,7 @@ ; X64-NEXT: orb $64, %dil ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: bsrl %eax, %eax -; X64-NEXT: xorl $7, %eax +; X64-NEXT: xorb $7, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -892,7 +892,7 @@ ; X86-CLZ-NEXT: orb $64, %al ; X86-CLZ-NEXT: movzbl %al, %eax ; X86-CLZ-NEXT: lzcntl %eax, %eax -; X86-CLZ-NEXT: addl $-24, %eax +; X86-CLZ-NEXT: addb $-24, %al ; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X86-CLZ-NEXT: retl ; @@ -901,7 +901,7 @@ ; X64-CLZ-NEXT: orb $64, %dil ; X64-CLZ-NEXT: movzbl %dil, %eax ; X64-CLZ-NEXT: lzcntl %eax, %eax -; X64-CLZ-NEXT: addl $-24, %eax +; X64-CLZ-NEXT: addb $-24, %al ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq Index: llvm/test/CodeGen/X86/fast-isel-cmp.ll =================================================================== --- llvm/test/CodeGen/X86/fast-isel-cmp.ll +++ llvm/test/CodeGen/X86/fast-isel-cmp.ll @@ -9,7 +9,7 @@ ; SDAG: ## %bb.0: ; SDAG-NEXT: cmpeqss %xmm1, %xmm0 ; SDAG-NEXT: movd %xmm0, %eax -; SDAG-NEXT: andl $1, %eax +; SDAG-NEXT: andb $1, %al ; SDAG-NEXT: ## kill: def $al killed $al killed $eax ; SDAG-NEXT: retq ; @@ -353,7 +353,7 @@ ; SDAG: ## %bb.0: ; SDAG-NEXT: cmpneqss %xmm1, %xmm0 ; SDAG-NEXT: movd %xmm0, %eax -; SDAG-NEXT: andl $1, %eax +; SDAG-NEXT: andb $1, %al ; SDAG-NEXT: ## kill: def $al killed $al killed $eax ; SDAG-NEXT: retq ; @@ -593,7 +593,7 @@ ; SDAG-NEXT: xorps %xmm1, %xmm1 ; SDAG-NEXT: cmpeqss %xmm0, %xmm1 ; SDAG-NEXT: movd %xmm1, %eax -; SDAG-NEXT: andl $1, %eax +; SDAG-NEXT: andb $1, %al ; SDAG-NEXT: ## kill: def $al killed $al killed $eax ; SDAG-NEXT: retq ; @@ -1248,7 +1248,7 @@ ; SDAG-NEXT: xorps %xmm1, %xmm1 ; SDAG-NEXT: cmpneqss %xmm0, %xmm1 ; SDAG-NEXT: movd %xmm1, %eax -; SDAG-NEXT: andl $1, %eax +; SDAG-NEXT: andb $1, %al ; SDAG-NEXT: ## kill: def $al killed $al killed $eax ; SDAG-NEXT: retq ; Index: llvm/test/CodeGen/X86/fptosi-sat-scalar.ll =================================================================== --- llvm/test/CodeGen/X86/fptosi-sat-scalar.ll +++ llvm/test/CodeGen/X86/fptosi-sat-scalar.ll @@ -763,7 +763,7 @@ ; X86-X87-NEXT: movl %ebx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %eax, (%ecx) -; X86-X87-NEXT: andl $15, %edx +; X86-X87-NEXT: andb $15, %dl ; X86-X87-NEXT: movb %dl, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $44, %esp @@ -816,7 +816,7 @@ ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %ecx, (%esi) -; X86-SSE-NEXT: andl $15, %eax +; X86-SSE-NEXT: andb $15, %al ; X86-SSE-NEXT: movb %al, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $28, %esp @@ -1777,7 +1777,7 @@ ; X86-X87-NEXT: movl %ebx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %eax, (%ecx) -; X86-X87-NEXT: andl $15, %edx +; X86-X87-NEXT: andb $15, %dl ; X86-X87-NEXT: movb %dl, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $60, %esp @@ -1830,7 +1830,7 @@ ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %ecx, (%esi) -; X86-SSE-NEXT: andl $15, %eax +; X86-SSE-NEXT: andb $15, %al ; X86-SSE-NEXT: movb %al, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $44, %esp @@ -2891,7 +2891,7 @@ ; X86-X87-NEXT: movl %ebx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %eax, (%ecx) -; X86-X87-NEXT: andl $15, %edx +; X86-X87-NEXT: andb $15, %dl ; X86-X87-NEXT: movb %dl, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $44, %esp @@ -2950,7 +2950,7 @@ ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %ecx, (%esi) -; X86-SSE-NEXT: andl $15, %eax +; X86-SSE-NEXT: andb $15, %al ; X86-SSE-NEXT: movb %al, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $44, %esp @@ -4230,7 +4230,7 @@ ; X86-X87-NEXT: movl %ebx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %eax, (%ecx) -; X86-X87-NEXT: andl $15, %edx +; X86-X87-NEXT: andb $15, %dl ; X86-X87-NEXT: movb %dl, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $60, %esp @@ -4291,7 +4291,7 @@ ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %ecx, (%esi) -; X86-SSE-NEXT: andl $15, %eax +; X86-SSE-NEXT: andb $15, %al ; X86-SSE-NEXT: movb %al, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $44, %esp Index: llvm/test/CodeGen/X86/fptoui-sat-scalar.ll =================================================================== --- llvm/test/CodeGen/X86/fptoui-sat-scalar.ll +++ llvm/test/CodeGen/X86/fptoui-sat-scalar.ll @@ -721,7 +721,7 @@ ; X86-X87-NEXT: movl %edx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %edi, (%ecx) -; X86-X87-NEXT: andl $15, %eax +; X86-X87-NEXT: andb $15, %al ; X86-X87-NEXT: movb %al, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $44, %esp @@ -769,7 +769,7 @@ ; X86-SSE-NEXT: movl %eax, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) ; X86-SSE-NEXT: movl %edx, (%esi) -; X86-SSE-NEXT: andl $15, %ebx +; X86-SSE-NEXT: andb $15, %bl ; X86-SSE-NEXT: movb %bl, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $32, %esp @@ -1647,7 +1647,7 @@ ; X86-X87-NEXT: movl %edx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %edi, (%ecx) -; X86-X87-NEXT: andl $15, %eax +; X86-X87-NEXT: andb $15, %al ; X86-X87-NEXT: movb %al, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $44, %esp @@ -1695,7 +1695,7 @@ ; X86-SSE-NEXT: movl %eax, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) ; X86-SSE-NEXT: movl %edx, (%esi) -; X86-SSE-NEXT: andl $15, %ebx +; X86-SSE-NEXT: andb $15, %bl ; X86-SSE-NEXT: movb %bl, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $32, %esp @@ -2677,7 +2677,7 @@ ; X86-X87-NEXT: movl %edx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %edi, (%ecx) -; X86-X87-NEXT: andl $15, %eax +; X86-X87-NEXT: andb $15, %al ; X86-X87-NEXT: movb %al, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $44, %esp @@ -2731,7 +2731,7 @@ ; X86-SSE-NEXT: movl %eax, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) ; X86-SSE-NEXT: movl %edx, (%esi) -; X86-SSE-NEXT: andl $15, %ebx +; X86-SSE-NEXT: andb $15, %bl ; X86-SSE-NEXT: movb %bl, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $32, %esp @@ -3892,7 +3892,7 @@ ; X86-X87-NEXT: movl %edx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %edi, (%ecx) -; X86-X87-NEXT: andl $15, %eax +; X86-X87-NEXT: andb $15, %al ; X86-X87-NEXT: movb %al, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $60, %esp @@ -3946,7 +3946,7 @@ ; X86-SSE-NEXT: movl %eax, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) ; X86-SSE-NEXT: movl %edx, (%esi) -; X86-SSE-NEXT: andl $15, %ebx +; X86-SSE-NEXT: andb $15, %bl ; X86-SSE-NEXT: movb %bl, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $48, %esp Index: llvm/test/CodeGen/X86/funnel-shift.ll =================================================================== --- llvm/test/CodeGen/X86/funnel-shift.ll +++ llvm/test/CodeGen/X86/funnel-shift.ll @@ -271,7 +271,7 @@ ; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax ; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax ; X64-AVX2-NEXT: subl %eax, %ecx -; X64-AVX2-NEXT: addl $27, %ecx +; X64-AVX2-NEXT: addb $27, %cl ; X64-AVX2-NEXT: shlq $27, %rsi ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-AVX2-NEXT: shrdq %cl, %rdi, %rsi @@ -357,16 +357,15 @@ ; X32-SSE2-LABEL: fshl_i32_undef0_msk: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: andl $7, %ecx -; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X32-SSE2-NEXT: andb $7, %cl ; X32-SSE2-NEXT: shldl %cl, %eax, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshl_i32_undef0_msk: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: movl %esi, %ecx -; X64-AVX2-NEXT: andl $7, %ecx +; X64-AVX2-NEXT: andb $7, %cl ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-AVX2-NEXT: shldl %cl, %edi, %eax ; X64-AVX2-NEXT: retq @@ -544,16 +543,15 @@ ; X32-SSE2-LABEL: fshr_i32_undef1_msk: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: andl $7, %ecx -; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X32-SSE2-NEXT: andb $7, %cl ; X32-SSE2-NEXT: shrdl %cl, %eax, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshr_i32_undef1_msk: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: movl %esi, %ecx -; X64-AVX2-NEXT: andl $7, %ecx +; X64-AVX2-NEXT: andb $7, %cl ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-AVX2-NEXT: shrdl %cl, %edi, %eax ; X64-AVX2-NEXT: retq Index: llvm/test/CodeGen/X86/load-local-v4i5.ll =================================================================== --- llvm/test/CodeGen/X86/load-local-v4i5.ll +++ llvm/test/CodeGen/X86/load-local-v4i5.ll @@ -11,6 +11,9 @@ ; CHECK-NEXT: movb -{{[0-9]+}}(%rsp), %cl ; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movzbl %cl, %edi +; CHECK-NEXT: shrb %cl +; CHECK-NEXT: movb %cl, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $31, %eax ; CHECK-NEXT: andl $31, %esi ; CHECK-NEXT: shll $5, %esi @@ -18,16 +21,12 @@ ; CHECK-NEXT: andl $31, %edx ; CHECK-NEXT: shll $10, %edx ; CHECK-NEXT: orl %esi, %edx -; CHECK-NEXT: movzbl %cl, %eax -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shll $15, %ecx -; CHECK-NEXT: orl %edx, %ecx -; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: shrl $16, %ecx -; CHECK-NEXT: andl $15, %ecx -; CHECK-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: cmpb $31, %al +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shll $15, %eax +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: cmpb $31, %dil ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %Then ; CHECK-NEXT: int3 Index: llvm/test/CodeGen/X86/lzcnt.ll =================================================================== --- llvm/test/CodeGen/X86/lzcnt.ll +++ llvm/test/CodeGen/X86/lzcnt.ll @@ -13,7 +13,7 @@ ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: lzcntl %eax, %eax -; X86-NEXT: addl $-24, %eax +; X86-NEXT: addb $-24, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -21,7 +21,7 @@ ; X32: # %bb.0: ; X32-NEXT: movzbl %dil, %eax ; X32-NEXT: lzcntl %eax, %eax -; X32-NEXT: addl $-24, %eax +; X32-NEXT: addb $-24, %al ; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retq ; @@ -29,7 +29,7 @@ ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: lzcntl %eax, %eax -; X64-NEXT: addl $-24, %eax +; X64-NEXT: addb $-24, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %tmp = tail call i8 @llvm.ctlz.i8( i8 %x, i1 false ) @@ -108,7 +108,7 @@ ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: lzcntl %eax, %eax -; X86-NEXT: addl $-24, %eax +; X86-NEXT: addb $-24, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -116,7 +116,7 @@ ; X32: # %bb.0: ; X32-NEXT: movzbl %dil, %eax ; X32-NEXT: lzcntl %eax, %eax -; X32-NEXT: addl $-24, %eax +; X32-NEXT: addb $-24, %al ; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retq ; @@ -124,7 +124,7 @@ ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: lzcntl %eax, %eax -; X64-NEXT: addl $-24, %eax +; X64-NEXT: addb $-24, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %tmp = tail call i8 @llvm.ctlz.i8( i8 %x, i1 true ) Index: llvm/test/CodeGen/X86/masked_store_trunc.ll =================================================================== --- llvm/test/CodeGen/X86/masked_store_trunc.ll +++ llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -390,7 +390,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB1_1 ; AVX1-NEXT: # %bb.2: # %else @@ -461,7 +461,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB1_1 ; AVX2-NEXT: # %bb.2: # %else @@ -779,7 +779,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %else @@ -852,7 +852,7 @@ ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1004,7 +1004,7 @@ ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1043,7 +1043,7 @@ ; SSE4-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE4-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE4-NEXT: movmskps %xmm3, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB3_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1144,7 +1144,7 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB4_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1187,7 +1187,7 @@ ; SSE4-NEXT: packusdw %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB4_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1227,7 +1227,7 @@ ; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB4_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1269,7 +1269,7 @@ ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB4_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1375,7 +1375,7 @@ ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %ecx -; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: xorb $15, %cl ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: jne .LBB5_1 @@ -1418,7 +1418,7 @@ ; SSE4-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE4-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE4-NEXT: movmskps %xmm3, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB5_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1458,7 +1458,7 @@ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB5_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1500,7 +1500,7 @@ ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB5_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1603,7 +1603,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB6_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1626,7 +1626,7 @@ ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE4-NEXT: movmskpd %xmm2, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB6_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1707,7 +1707,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB7_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1732,7 +1732,7 @@ ; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE4-NEXT: movmskpd %xmm2, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB7_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1755,7 +1755,7 @@ ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: xorb $3, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB7_1 ; AVX-NEXT: # %bb.2: # %else @@ -1830,7 +1830,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB8_1 @@ -1853,7 +1853,7 @@ ; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE4-NEXT: movmskpd %xmm2, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB8_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1875,7 +1875,7 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: xorb $3, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB8_1 ; AVX-NEXT: # %bb.2: # %else @@ -3476,7 +3476,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB11_1 ; AVX1-NEXT: # %bb.2: # %else @@ -3543,7 +3543,7 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB11_1 ; AVX2-NEXT: # %bb.2: # %else @@ -3853,7 +3853,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB12_1 ; AVX1-NEXT: # %bb.2: # %else @@ -3923,7 +3923,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB12_1 ; AVX2-NEXT: # %bb.2: # %else @@ -4082,7 +4082,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB13_1 ; SSE2-NEXT: # %bb.2: # %else @@ -4122,7 +4122,7 @@ ; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB13_1 ; SSE4-NEXT: # %bb.2: # %else @@ -4158,7 +4158,7 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskps %xmm1, %eax -; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: xorb $15, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB13_1 ; AVX-NEXT: # %bb.2: # %else @@ -4256,7 +4256,7 @@ ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %ecx -; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: xorb $15, %cl ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: jne .LBB14_1 @@ -4296,7 +4296,7 @@ ; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB14_1 ; SSE4-NEXT: # %bb.2: # %else @@ -4332,7 +4332,7 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskps %xmm1, %eax -; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: xorb $15, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB14_1 ; AVX-NEXT: # %bb.2: # %else Index: llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll =================================================================== --- llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -708,7 +708,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB1_1 ; AVX1-NEXT: # %bb.2: # %else @@ -787,7 +787,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB1_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1259,7 +1259,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1339,7 +1339,7 @@ ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1555,7 +1555,7 @@ ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm9 ; SSE2-NEXT: movmskps %xmm9, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1611,7 +1611,7 @@ ; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] ; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 ; SSE4-NEXT: movmskps %xmm4, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB3_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1787,7 +1787,7 @@ ; SSE2-NEXT: packssdw %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm9 ; SSE2-NEXT: movmskps %xmm9, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB4_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1845,7 +1845,7 @@ ; SSE4-NEXT: packssdw %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 ; SSE4-NEXT: movmskps %xmm4, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB4_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1893,7 +1893,7 @@ ; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB4_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1939,7 +1939,7 @@ ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB4_1 ; AVX2-NEXT: # %bb.2: # %else @@ -2107,7 +2107,7 @@ ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm9 ; SSE2-NEXT: movmskps %xmm9, %ecx -; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: xorb $15, %cl ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: jne .LBB5_1 @@ -2167,7 +2167,7 @@ ; SSE4-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 ; SSE4-NEXT: movmskps %xmm4, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB5_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2217,7 +2217,7 @@ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB5_1 ; AVX1-NEXT: # %bb.2: # %else @@ -2265,7 +2265,7 @@ ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB5_1 ; AVX2-NEXT: # %bb.2: # %else @@ -2402,7 +2402,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB6_1 ; SSE2-NEXT: # %bb.2: # %else @@ -2434,7 +2434,7 @@ ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 ; SSE4-NEXT: movmskpd %xmm3, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB6_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2563,7 +2563,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB7_1 ; SSE2-NEXT: # %bb.2: # %else @@ -2597,7 +2597,7 @@ ; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 ; SSE4-NEXT: movmskpd %xmm3, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB7_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2626,7 +2626,7 @@ ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: xorb $3, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB7_1 ; AVX-NEXT: # %bb.2: # %else @@ -2735,7 +2735,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm3, %ecx ; SSE2-NEXT: jne .LBB8_1 @@ -2767,7 +2767,7 @@ ; SSE4-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 ; SSE4-NEXT: movmskpd %xmm3, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB8_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2795,7 +2795,7 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: xorb $3, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB8_1 ; AVX-NEXT: # %bb.2: # %else @@ -4384,7 +4384,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB11_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4451,7 +4451,7 @@ ; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB11_1 ; AVX2-NEXT: # %bb.2: # %else @@ -4760,7 +4760,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB12_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4828,7 +4828,7 @@ ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB12_1 ; AVX2-NEXT: # %bb.2: # %else @@ -4993,7 +4993,7 @@ ; SSE2-NEXT: packssdw %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB13_1 ; SSE2-NEXT: # %bb.2: # %else @@ -5033,7 +5033,7 @@ ; SSE4-NEXT: packssdw %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB13_1 ; SSE4-NEXT: # %bb.2: # %else @@ -5069,7 +5069,7 @@ ; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskps %xmm1, %eax -; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: xorb $15, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB13_1 ; AVX-NEXT: # %bb.2: # %else @@ -5185,7 +5185,7 @@ ; SSE2-NEXT: packuswb %xmm3, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %ecx -; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: xorb $15, %cl ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: jne .LBB14_1 @@ -5228,7 +5228,7 @@ ; SSE4-NEXT: packsswb %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB14_1 ; SSE4-NEXT: # %bb.2: # %else @@ -5267,7 +5267,7 @@ ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB14_1 ; AVX1-NEXT: # %bb.2: # %else @@ -5308,7 +5308,7 @@ ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB14_1 ; AVX2-NEXT: # %bb.2: # %else Index: llvm/test/CodeGen/X86/masked_store_trunc_usat.ll =================================================================== --- llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -582,7 +582,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB1_1 ; AVX1-NEXT: # %bb.2: # %else @@ -660,7 +660,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB1_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1064,7 +1064,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1143,7 +1143,7 @@ ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1327,7 +1327,7 @@ ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1380,7 +1380,7 @@ ; SSE4-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2] ; SSE4-NEXT: pcmpeqd %xmm2, %xmm6 ; SSE4-NEXT: movmskps %xmm6, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB3_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1527,7 +1527,7 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB4_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1582,7 +1582,7 @@ ; SSE4-NEXT: packusdw %xmm5, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm6 ; SSE4-NEXT: movmskps %xmm6, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB4_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1629,7 +1629,7 @@ ; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB4_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1675,7 +1675,7 @@ ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB4_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1811,7 +1811,7 @@ ; SSE2-NEXT: packuswb %xmm4, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm9 ; SSE2-NEXT: movmskps %xmm9, %ecx -; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: xorb $15, %cl ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm4, %eax ; SSE2-NEXT: jne .LBB5_1 @@ -1868,7 +1868,7 @@ ; SSE4-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; SSE4-NEXT: pcmpeqd %xmm2, %xmm8 ; SSE4-NEXT: movmskps %xmm8, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB5_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1917,7 +1917,7 @@ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB5_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1965,7 +1965,7 @@ ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB5_1 ; AVX2-NEXT: # %bb.2: # %else @@ -2085,7 +2085,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB6_1 ; SSE2-NEXT: # %bb.2: # %else @@ -2115,7 +2115,7 @@ ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 ; SSE4-NEXT: movmskpd %xmm3, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB6_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2225,7 +2225,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB7_1 ; SSE2-NEXT: # %bb.2: # %else @@ -2257,7 +2257,7 @@ ; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 ; SSE4-NEXT: movmskpd %xmm3, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB7_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2285,7 +2285,7 @@ ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: xorb $3, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB7_1 ; AVX-NEXT: # %bb.2: # %else @@ -2377,7 +2377,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm4, %ecx ; SSE2-NEXT: jne .LBB8_1 @@ -2407,7 +2407,7 @@ ; SSE4-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm4 ; SSE4-NEXT: movmskpd %xmm4, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB8_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2434,7 +2434,7 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: xorb $3, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB8_1 ; AVX-NEXT: # %bb.2: # %else @@ -4110,7 +4110,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB11_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4179,7 +4179,7 @@ ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB11_1 ; AVX2-NEXT: # %bb.2: # %else @@ -4506,7 +4506,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB12_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4576,7 +4576,7 @@ ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB12_1 ; AVX2-NEXT: # %bb.2: # %else @@ -4745,7 +4745,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB13_1 ; SSE2-NEXT: # %bb.2: # %else @@ -4786,7 +4786,7 @@ ; SSE4-NEXT: packusdw %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB13_1 ; SSE4-NEXT: # %bb.2: # %else @@ -4823,7 +4823,7 @@ ; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB13_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4861,7 +4861,7 @@ ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB13_1 ; AVX2-NEXT: # %bb.2: # %else @@ -4971,7 +4971,7 @@ ; SSE2-NEXT: packuswb %xmm4, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %ecx -; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: xorb $15, %cl ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm4, %eax ; SSE2-NEXT: jne .LBB14_1 @@ -5013,7 +5013,7 @@ ; SSE4-NEXT: packuswb %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB14_1 ; SSE4-NEXT: # %bb.2: # %else @@ -5051,7 +5051,7 @@ ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB14_1 ; AVX1-NEXT: # %bb.2: # %else @@ -5090,7 +5090,7 @@ ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB14_1 ; AVX2-NEXT: # %bb.2: # %else Index: llvm/test/CodeGen/X86/movmsk-cmp.ll =================================================================== --- llvm/test/CodeGen/X86/movmsk-cmp.ll +++ llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -4109,8 +4109,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax -; SSE2-NEXT: cmpb $3, %al +; SSE2-NEXT: testb %al, %al ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -4118,8 +4117,7 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 ; SSE41-NEXT: movmskpd %xmm0, %eax -; SSE41-NEXT: xorl $3, %eax -; SSE41-NEXT: cmpb $3, %al +; SSE41-NEXT: testb %al, %al ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -4127,8 +4125,7 @@ ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: vmovmskpd %xmm0, %eax -; AVX1OR2-NEXT: xorl $3, %eax -; AVX1OR2-NEXT: cmpb $3, %al +; AVX1OR2-NEXT: testb %al, %al ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; @@ -4165,7 +4162,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorb $3, %al +; SSE2-NEXT: cmpb $3, %al ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -4173,7 +4170,7 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 ; SSE41-NEXT: movmskpd %xmm0, %eax -; SSE41-NEXT: xorb $3, %al +; SSE41-NEXT: cmpb $3, %al ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -4181,7 +4178,7 @@ ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: vmovmskpd %xmm0, %eax -; AVX1OR2-NEXT: xorb $3, %al +; AVX1OR2-NEXT: cmpb $3, %al ; AVX1OR2-NEXT: setne %al ; AVX1OR2-NEXT: retq ; Index: llvm/test/CodeGen/X86/mul-constant-i8.ll =================================================================== --- llvm/test/CodeGen/X86/mul-constant-i8.ll +++ llvm/test/CodeGen/X86/mul-constant-i8.ll @@ -463,7 +463,7 @@ ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: addl %edi, %edi ; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: negl %eax +; X64-NEXT: negb %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, -10 @@ -476,7 +476,7 @@ ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: shll $2, %edi ; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: negl %eax +; X64-NEXT: negb %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, -36 Index: llvm/test/CodeGen/X86/parity.ll =================================================================== --- llvm/test/CodeGen/X86/parity.ll +++ llvm/test/CodeGen/X86/parity.ll @@ -338,14 +338,14 @@ ; X86-POPCNT-LABEL: parity_32_trunc: ; X86-POPCNT: # %bb.0: ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: andl $1, %eax +; X86-POPCNT-NEXT: andb $1, %al ; X86-POPCNT-NEXT: # kill: def $al killed $al killed $eax ; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: parity_32_trunc: ; X64-POPCNT: # %bb.0: ; X64-POPCNT-NEXT: popcntl %edi, %eax -; X64-POPCNT-NEXT: andl $1, %eax +; X64-POPCNT-NEXT: andb $1, %al ; X64-POPCNT-NEXT: # kill: def $al killed $al killed $eax ; X64-POPCNT-NEXT: retq %1 = tail call i32 @llvm.ctpop.i32(i32 %x) Index: llvm/test/CodeGen/X86/pr15267.ll =================================================================== --- llvm/test/CodeGen/X86/pr15267.ll +++ llvm/test/CodeGen/X86/pr15267.ll @@ -89,62 +89,75 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movq (%rdi), %rax ; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shrl $4, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: shrb $4, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: andl $15, %edx +; CHECK-NEXT: andb $15, %dl +; CHECK-NEXT: movzbl %dl, %edx ; CHECK-NEXT: vmovd %edx, %xmm0 ; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $8, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $12, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $16, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $20, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $24, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $28, %ecx ; CHECK-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $36, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $40, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $44, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $48, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $52, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $56, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: shrq $60, %rax ; CHECK-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 Index: llvm/test/CodeGen/X86/pr40539.ll =================================================================== --- llvm/test/CodeGen/X86/pr40539.ll +++ llvm/test/CodeGen/X86/pr40539.ll @@ -18,7 +18,7 @@ ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cmpeqss (%esp), %xmm0 ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: popl %ecx ; CHECK-NEXT: .cfi_def_cfa_offset 4 Index: llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll =================================================================== --- llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll +++ llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll @@ -15,7 +15,7 @@ ; ; CHECK32-LABEL: f32_bzhi: ; CHECK32: # %bb.0: # %entry -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al ; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax ; CHECK32-NEXT: retl entry: @@ -34,7 +34,7 @@ ; ; CHECK32-LABEL: f32_bzhi_partial: ; CHECK32: # %bb.0: # %entry -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al ; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax ; CHECK32-NEXT: retl entry: Index: llvm/test/CodeGen/X86/setoeq.ll =================================================================== --- llvm/test/CodeGen/X86/setoeq.ll +++ llvm/test/CodeGen/X86/setoeq.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1 ; CHECK-NEXT: cmpeqsd %xmm0, %xmm1 ; CHECK-NEXT: movd %xmm1, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retl entry: @@ -28,7 +28,7 @@ ; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1 ; CHECK-NEXT: cmpneqsd %xmm0, %xmm1 ; CHECK-NEXT: movd %xmm1, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retl entry: Index: llvm/test/CodeGen/X86/vector-compare-all_of.ll =================================================================== --- llvm/test/CodeGen/X86/vector-compare-all_of.ll +++ llvm/test/CodeGen/X86/vector-compare-all_of.ll @@ -1066,8 +1066,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: xorl $15, %eax -; SSE-NEXT: cmpb $15, %al +; SSE-NEXT: testb %al, %al ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -1075,8 +1074,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovmskps %xmm0, %eax -; AVX-NEXT: xorl $15, %eax -; AVX-NEXT: cmpb $15, %al +; AVX-NEXT: testb %al, %al ; AVX-NEXT: sete %al ; AVX-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-compare-any_of.ll =================================================================== --- llvm/test/CodeGen/X86/vector-compare-any_of.ll +++ llvm/test/CodeGen/X86/vector-compare-any_of.ll @@ -963,7 +963,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: xorb $15, %al +; SSE-NEXT: cmpb $15, %al ; SSE-NEXT: setne %al ; SSE-NEXT: retq ; @@ -971,7 +971,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovmskps %xmm0, %eax -; AVX-NEXT: xorb $15, %al +; AVX-NEXT: cmpb $15, %al ; AVX-NEXT: setne %al ; AVX-NEXT: retq ;