Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -663,14 +663,6 @@ /// affected nodes are stored as a prefix in \p StoreNodes). bool MergeConsecutiveStores(StoreSDNode *St); - /// Try to transform a truncation where C is a constant: - /// (trunc (and X, C)) -> (and (trunc X), (trunc C)) - /// - /// \p N needs to be a truncation and its first operand an AND. Other - /// requirements are checked by the function (e.g. that trunc is - /// single-use) and if missed an empty SDValue is returned. - SDValue distributeTruncateThroughAnd(SDNode *N); - /// Helper function to determine whether the target supports operation /// given by \p Opcode for type \p VT, that is, whether the operation /// is legal or custom before legalizing operations, and whether is @@ -7404,29 +7396,6 @@ return DAG.getNode(LHS.getOpcode(), DL, VT, NewShift, NewRHS); } -SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) { - assert(N->getOpcode() == ISD::TRUNCATE); - assert(N->getOperand(0).getOpcode() == ISD::AND); - - // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC) - EVT TruncVT = N->getValueType(0); - if (N->hasOneUse() && N->getOperand(0).hasOneUse() && - TLI.isTypeDesirableForOp(ISD::AND, TruncVT)) { - SDValue N01 = N->getOperand(0).getOperand(1); - if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) { - SDLoc DL(N); - SDValue N00 = N->getOperand(0).getOperand(0); - SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00); - SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01); - AddToWorklist(Trunc00.getNode()); - AddToWorklist(Trunc01.getNode()); - return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01); - } - } - - return SDValue(); -} - SDValue DAGCombiner::visitRotate(SDNode *N) { SDLoc dl(N); SDValue N0 = N->getOperand(0); @@ -7455,13 +7424,6 @@ } } - // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))). - if (N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) - return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1); - } - unsigned NextOp = N0.getOpcode(); // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize) if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { @@ -7536,13 +7498,6 @@ APInt::getAllOnesValue(OpSizeInBits))) return DAG.getConstant(0, SDLoc(N), VT); - // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))). - if (N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) - return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1); - } - // TODO - support non-uniform vector shift amounts. if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -7879,13 +7834,6 @@ } } - // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))). 
- if (N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) - return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1); - } - // fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2)) // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2)) // if c1 is equal to the number of bits the trunc removes @@ -8087,13 +8035,6 @@ } } - // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))). - if (N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) - return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1); - } - // fold operands of srl based on knowledge that the low bits are not // demanded. // TODO - support non-uniform vector shift amounts. @@ -10898,9 +10839,10 @@ case ISD::AND: case ISD::OR: case ISD::XOR: - if (!LegalOperations && N0.hasOneUse() && + if ((!LegalOperations || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) && (isConstantOrConstantVector(N0.getOperand(0), true) || - isConstantOrConstantVector(N0.getOperand(1), true))) { + isConstantOrConstantVector(N0.getOperand(1), true)) && + N0.hasOneUse()) { // TODO: We already restricted this to pre-legalization, but for vectors // we are extra cautious to not create an unsupported operation. // Target-specific changes are likely needed to avoid regressions here. Index: llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -120,16 +120,15 @@ ; GCN-LABEL: {{^}}sink_ubfe_i16: ; GCN-NOT: lshr -; VI: s_load_dword [[ARG:s[0-9]+]], s[0:1], 0x2c -; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004 -; GCN: s_cbranch_scc1 - ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 -; VI: v_mov_b32_e32 v{{[0-9]+}}, 0xff +; VI: s_load_dword [[ARG:s[0-9]+]], s[0:1], 0x2c +; VI: v_mov_b32_e32 {{v[0-9]+}}, 0xfff +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 +; VI: s_cbranch_scc1 ; GCN: BB2_2: ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 -; VI: v_mov_b32_e32 v{{[0-9]+}}, 0x7f +; VI: v_and_b32_e32 v{{[0-9]+}}, 0x7f ; GCN: BB2_3: ; GCN: buffer_store_short Index: llvm/test/CodeGen/AMDGPU/idot8s.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8s.ll +++ llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -1710,66 +1710,87 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e64 v0, s2, 15 +; GFX9-NEXT: v_and_b32_e64 v5, s4, 15 +; GFX9-NEXT: s_lshr_b32 s9, s2, 8 +; GFX9-NEXT: s_lshr_b32 s11, s2, 4 +; GFX9-NEXT: s_lshr_b32 s16, s4, 8 +; GFX9-NEXT: s_lshr_b32 s18, s4, 4 +; GFX9-NEXT: v_and_b32_e64 v3, s9, 15 +; GFX9-NEXT: v_and_b32_e64 v7, s16, 15 +; GFX9-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-NEXT: s_lshr_b32 s14, s4, 16 +; GFX9-NEXT: s_lshr_b32 s10, s2, 12 +; GFX9-NEXT: s_lshr_b32 s17, s4, 12 +; GFX9-NEXT: v_and_b32_e64 v1, s11, 15 +; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_and_b32_e64 v6, s18, 15 +; GFX9-NEXT: v_and_b32_e32 v5, v2, v5 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-NEXT: 
v_and_b32_e64 v4, s7, 15 +; GFX9-NEXT: v_and_b32_e64 v8, s14, 15 +; GFX9-NEXT: s_lshr_b32 s8, s2, 20 +; GFX9-NEXT: s_lshr_b32 s15, s4, 20 +; GFX9-NEXT: v_and_b32_e64 v1, s10, 15 +; GFX9-NEXT: v_and_b32_e32 v3, v2, v3 +; GFX9-NEXT: v_and_b32_e64 v6, s17, 15 +; GFX9-NEXT: v_and_b32_e32 v7, v2, v7 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX9-NEXT: s_lshr_b32 s6, s2, 24 +; GFX9-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-NEXT: v_and_b32_e64 v3, s8, 15 +; GFX9-NEXT: v_and_b32_e32 v4, v2, v4 +; GFX9-NEXT: v_and_b32_e64 v7, s15, 15 +; GFX9-NEXT: v_and_b32_e32 v8, v2, v8 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX9-NEXT: v_and_b32_e64 v4, s6, 15 +; GFX9-NEXT: v_and_b32_e64 v8, s13, 15 +; GFX9-NEXT: v_and_b32_e32 v4, v2, v4 +; GFX9-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-NEXT: s_lshr_b32 s12, s4, 28 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX9-NEXT: v_lshl_or_b32 v4, s5, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v2, s12, 16, v2 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v4, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v1, v6 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s5, s4, 15 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s5, s6 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s5, s6 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40014 -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s9, s10 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s13, s2 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s7, s8 +; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s12 -; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 
op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s14, s4 -; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v9 -; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v6, v6, v10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX9-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1777,66 +1798,87 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_and_b32_e64 v0, s2, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v5, s4, 15 +; GFX9-DL-NEXT: s_lshr_b32 s9, s2, 8 +; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 4 +; GFX9-DL-NEXT: s_lshr_b32 s16, s4, 8 +; GFX9-DL-NEXT: s_lshr_b32 s18, s4, 4 +; GFX9-DL-NEXT: v_and_b32_e64 v3, s9, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v7, s16, 15 +; GFX9-DL-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 16 +; GFX9-DL-NEXT: s_lshr_b32 s10, s2, 12 +; GFX9-DL-NEXT: s_lshr_b32 s17, s4, 12 +; GFX9-DL-NEXT: v_and_b32_e64 v1, s11, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX9-DL-NEXT: v_and_b32_e64 v6, s18, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v5, v2, v5 +; GFX9-DL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-DL-NEXT: v_and_b32_e64 v4, s7, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v8, s14, 15 +; GFX9-DL-NEXT: s_lshr_b32 s8, s2, 20 +; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 20 +; GFX9-DL-NEXT: v_and_b32_e64 v1, s10, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v3, v2, v3 +; GFX9-DL-NEXT: v_and_b32_e64 v6, s17, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v7, v2, v7 +; GFX9-DL-NEXT: v_lshl_or_b32 
v1, v1, 16, v3 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-DL-NEXT: v_and_b32_e64 v3, s8, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v4, v2, v4 +; GFX9-DL-NEXT: v_and_b32_e64 v7, s15, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8 +; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v4 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX9-DL-NEXT: v_and_b32_e64 v4, s6, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v8, s13, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v4, v2, v4 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-DL-NEXT: s_lshr_b32 s12, s4, 28 +; GFX9-DL-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, s5, 16, v4 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, s12, 16, v2 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v1, v6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v0, v5 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40014 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s10 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s13, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s7, s8 +; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, v3 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s12 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, 
s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s14, s4 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v9 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX9-DL-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1844,67 +1886,88 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v15, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40010 +; GFX10-DL-NEXT: v_and_b32_e64 v4, s2, 15 +; 
GFX10-DL-NEXT: v_and_b32_e64 v5, s4, 15 +; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 4 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 4 +; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 8 +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 8 +; GFX10-DL-NEXT: v_and_b32_e32 v4, v15, v4 +; GFX10-DL-NEXT: v_and_b32_e64 v6, s0, 15 +; GFX10-DL-NEXT: v_and_b32_e64 v7, s1, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v5, v15, v5 +; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 12 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 12 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 +; GFX10-DL-NEXT: v_and_b32_e64 v6, s5, 15 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX10-DL-NEXT: v_and_b32_e64 v2, s6, 15 +; GFX10-DL-NEXT: v_and_b32_e64 v8, s0, 15 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_and_b32_e32 v6, v15, v6 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_and_b32_e64 v9, s1, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v7, v15, v2 +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 16 +; GFX10-DL-NEXT: v_lshl_or_b32 v6, v8, 16, v6 +; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 20 +; GFX10-DL-NEXT: v_lshl_or_b32 v7, v9, 16, v7 +; GFX10-DL-NEXT: v_and_b32_e64 v8, s5, 15 +; GFX10-DL-NEXT: v_and_b32_e64 v9, s6, 15 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 20 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s5 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s8 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_and_b32_e64 v10, s0, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v8, v15, v8 +; GFX10-DL-NEXT: v_and_b32_e64 v11, s1, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v9, v15, v9 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s2 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s4 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s1 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v8 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v6, v7 +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 24 +; GFX10-DL-NEXT: v_lshl_or_b32 v8, v10, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v9, v11, 16, v9 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_and_b32_e64 v5, s5, 15 +; GFX10-DL-NEXT: v_and_b32_e64 v10, s6, 15 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 28 +; GFX10-DL-NEXT: v_and_b32_e32 v5, v15, v5 +; GFX10-DL-NEXT: 
v_and_b32_e32 v2, v15, v10 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, s0, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, s1, 16, v2 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v9 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v5, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v7 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v8 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, Index: llvm/test/CodeGen/AMDGPU/idot8u.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8u.ll +++ llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2079,52 +2079,69 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v14, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s9, s2, 8 +; GFX9-NEXT: v_and_b32_e64 v3, s9, 15 +; GFX9-NEXT: s_lshr_b32 s10, s2, 12 +; GFX9-NEXT: v_and_b32_e64 v0, s2, 15 +; GFX9-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-NEXT: s_lshr_b32 s11, s2, 4 +; GFX9-NEXT: s_lshr_b32 s18, s4, 4 +; GFX9-NEXT: v_and_b32_e64 v2, s10, 15 +; GFX9-NEXT: v_and_b32_e32 v3, v14, v3 +; GFX9-NEXT: v_and_b32_e64 v7, s4, 15 +; GFX9-NEXT: v_and_b32_e64 v5, s7, 15 +; GFX9-NEXT: s_lshr_b32 s8, s2, 20 +; GFX9-NEXT: s_lshr_b32 s16, s4, 8 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX9-NEXT: v_and_b32_e64 v1, s11, 15 +; GFX9-NEXT: v_and_b32_e32 v0, v14, v0 +; GFX9-NEXT: v_and_b32_e64 v8, s18, 15 +; GFX9-NEXT: v_and_b32_e32 v3, v14, v7 +; GFX9-NEXT: 
s_lshr_b32 s17, s4, 12 +; GFX9-NEXT: v_and_b32_e64 v4, s8, 15 +; GFX9-NEXT: v_and_b32_e32 v5, v14, v5 +; GFX9-NEXT: v_and_b32_e64 v10, s16, 15 +; GFX9-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_and_b32_e64 v9, s17, 15 +; GFX9-NEXT: v_and_b32_e32 v5, v14, v10 +; GFX9-NEXT: v_lshl_or_b32 v5, v9, 16, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, s0, v3 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s1, v4 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, s5, v5 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_pk_mul_lo_u16 v6, s0, v6 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v5 +; GFX9-NEXT: global_load_ushort v5, v[0:1], off +; GFX9-NEXT: s_lshr_b32 s14, s4, 16 +; GFX9-NEXT: v_and_b32_e64 v12, s14, 15 +; GFX9-NEXT: s_lshr_b32 s15, s4, 20 +; GFX9-NEXT: s_lshr_b32 s6, s2, 24 +; GFX9-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-NEXT: v_and_b32_e64 v11, s15, 15 +; GFX9-NEXT: v_and_b32_e32 v12, v14, v12 +; GFX9-NEXT: v_and_b32_e64 v6, s6, 15 +; GFX9-NEXT: v_and_b32_e64 v13, s13, 15 +; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v12 +; GFX9-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-NEXT: v_and_b32_e32 v6, v14, v6 +; GFX9-NEXT: s_lshr_b32 s12, s4, 28 +; GFX9-NEXT: v_and_b32_e32 v13, v14, v13 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v11 +; GFX9-NEXT: v_lshl_or_b32 v13, s12, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v6, s5, 16, v6 +; GFX9-NEXT: v_pk_mul_lo_u16 v6, v6, v13 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, v3, v5 +; GFX9-NEXT: v_add_u32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v[0:1], v2, off @@ -2134,52 +2151,69 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v14, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_lshr_b32 s9, s2, 8 +; GFX9-DL-NEXT: v_and_b32_e64 v3, s9, 15 +; GFX9-DL-NEXT: s_lshr_b32 s10, s2, 12 +; GFX9-DL-NEXT: v_and_b32_e64 v0, s2, 15 +; GFX9-DL-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 4 +; GFX9-DL-NEXT: s_lshr_b32 s18, s4, 4 +; GFX9-DL-NEXT: v_and_b32_e64 v2, s10, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v3, v14, v3 +; GFX9-DL-NEXT: v_and_b32_e64 v7, s4, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v5, s7, 15 +; GFX9-DL-NEXT: s_lshr_b32 s8, s2, 20 +; GFX9-DL-NEXT: s_lshr_b32 s16, s4, 8 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX9-DL-NEXT: v_and_b32_e64 v1, s11, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v0, v14, v0 +; GFX9-DL-NEXT: v_and_b32_e64 v8, s18, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v3, v14, v7 +; GFX9-DL-NEXT: s_lshr_b32 s17, s4, 12 +; GFX9-DL-NEXT: v_and_b32_e64 v4, s8, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v5, v14, v5 +; GFX9-DL-NEXT: v_and_b32_e64 v10, s16, 15 +; GFX9-DL-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX9-DL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v0, v3 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_and_b32_e64 v9, s17, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v5, v14, v10 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s0, v3 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s1, v4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s5, v5 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, s0, v6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5 +; GFX9-DL-NEXT: global_load_ushort v5, v[0:1], off +; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 16 +; GFX9-DL-NEXT: v_and_b32_e64 v12, 
s14, 15 +; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 20 +; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-DL-NEXT: v_and_b32_e64 v11, s15, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v12, v14, v12 +; GFX9-DL-NEXT: v_and_b32_e64 v6, s6, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v13, s13, 15 +; GFX9-DL-NEXT: v_lshl_or_b32 v11, v11, 16, v12 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-DL-NEXT: v_and_b32_e32 v6, v14, v6 +; GFX9-DL-NEXT: s_lshr_b32 s12, s4, 28 +; GFX9-DL-NEXT: v_and_b32_e32 v13, v14, v13 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v11 +; GFX9-DL-NEXT: v_lshl_or_b32 v13, s12, 16, v13 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, s5, 16, v6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v13 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_add_u32_e32 v5, v3, v5 +; GFX9-DL-NEXT: v_add_u32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off @@ -2189,51 +2223,72 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s1, s6 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s7, s0 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s8, s1 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: 
s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-DL-NEXT: v_and_b32_e64 v4, s2, 15 +; GFX10-DL-NEXT: v_and_b32_e64 v5, s4, 15 +; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 4 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 4 +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 8 +; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 8 +; GFX10-DL-NEXT: v_and_b32_e32 v5, v2, v5 +; GFX10-DL-NEXT: v_and_b32_e64 v6, s0, 15 +; GFX10-DL-NEXT: v_and_b32_e64 v7, s1, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v4, v2, v4 +; GFX10-DL-NEXT: v_and_b32_e64 v8, s5, 15 +; GFX10-DL-NEXT: v_and_b32_e64 v9, s6, 15 +; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 12 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 12 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 +; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 16 +; GFX10-DL-NEXT: v_and_b32_e64 v6, s0, 15 +; GFX10-DL-NEXT: v_and_b32_e64 v7, s1, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v9, v2, v9 +; GFX10-DL-NEXT: v_and_b32_e32 v8, v2, v8 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10-DL-NEXT: v_and_b32_e64 v11, s6, 15 +; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 20 +; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v9 +; GFX10-DL-NEXT: v_and_b32_e64 v5, s5, 15 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 20 +; GFX10-DL-NEXT: v_and_b32_e64 v8, s0, 15 +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v5, v2, v5 +; GFX10-DL-NEXT: v_and_b32_e64 v9, s1, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v10, v2, v11 +; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 24 +; GFX10-DL-NEXT: v_and_b32_e64 v7, s5, 15 +; GFX10-DL-NEXT: v_lshl_or_b32 v15, v8, 16, v5 +; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 28 +; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v10 +; GFX10-DL-NEXT: v_and_b32_e64 v11, s6, 15 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 28 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v15, v9 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s1, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s4 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_and_b32_e32 v4, v2, v11 +; GFX10-DL-NEXT: v_and_b32_e32 v2, v2, v7 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, s1, 16, v4 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, s0, 16, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v5 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: 
v_add_nc_u32_e32 v3, v3, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2426,57 +2481,62 @@ ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40018 -; GFX9-NEXT: s_lshr_b32 s14, s1, 28 -; GFX9-NEXT: s_and_b32 s15, s1, 15 -; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-NEXT: s_lshr_b32 s7, s0, 28 -; GFX9-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-NEXT: s_and_b32 s8, s0, 15 -; GFX9-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v10, s1 -; GFX9-NEXT: v_mul_lo_u16_e32 v3, s4, v3 -; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v5, s6, v5 -; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v7, s8, v7 -; GFX9-NEXT: v_mul_lo_u16_sdwa v8, s9, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX9-NEXT: v_mul_lo_u16_e32 v9, s10, v9 -; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v6 +; GFX9-NEXT: s_lshr_b32 s8, s0, 4 +; GFX9-NEXT: s_lshr_b32 s15, s1, 4 +; GFX9-NEXT: s_lshr_b32 s9, s0, 8 +; GFX9-NEXT: s_lshr_b32 s10, s0, 12 +; GFX9-NEXT: s_lshr_b32 s16, s1, 8 +; GFX9-NEXT: s_lshr_b32 s17, s1, 12 +; GFX9-NEXT: v_and_b32_e64 v3, s0, 15 +; GFX9-NEXT: v_and_b32_e64 v4, s1, 15 +; GFX9-NEXT: v_and_b32_e64 v7, s8, 15 +; GFX9-NEXT: v_and_b32_e64 v13, s15, 15 +; GFX9-NEXT: v_and_b32_e64 v5, s10, 15 +; GFX9-NEXT: v_and_b32_e64 v11, s17, 15 +; GFX9-NEXT: v_and_b32_e64 v6, s9, 15 +; GFX9-NEXT: v_and_b32_e64 v12, s16, 15 +; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX9-NEXT: v_mul_lo_u16_e32 v6, v6, v12 +; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s0, 20 +; GFX9-NEXT: s_lshr_b32 s11, s1, 16 +; GFX9-NEXT: s_lshr_b32 s12, s1, 20 ; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX9-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: 
s_lshr_b32 s7, s0, 24 +; GFX9-NEXT: s_lshr_b32 s13, s1, 28 +; GFX9-NEXT: s_lshr_b32 s14, s1, 24 +; GFX9-NEXT: v_or_b32_e32 v5, v3, v5 +; GFX9-NEXT: v_and_b32_e64 v9, s5, 15 +; GFX9-NEXT: v_and_b32_e64 v15, s12, 15 +; GFX9-NEXT: v_and_b32_e64 v10, s4, 15 +; GFX9-NEXT: v_and_b32_e64 v16, s11, 15 +; GFX9-NEXT: s_lshr_b32 s6, s0, 28 +; GFX9-NEXT: v_and_b32_e64 v8, s7, 15 +; GFX9-NEXT: v_and_b32_e64 v14, s14, 15 +; GFX9-NEXT: v_mov_b32_e32 v17, s13 +; GFX9-NEXT: v_mul_lo_u16_e32 v4, v10, v16 +; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s6, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v14 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-NEXT: v_or_b32_e32 v6, v4, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v7 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v6 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2492,57 +2552,62 @@ ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 28 -; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s17, s1, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s1 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s4, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s6, v5 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, s7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, s8, v7 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, s9, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s10, v9 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v6 +; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 4 +; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 4 +; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 8 +; GFX9-DL-NEXT: s_lshr_b32 s10, s0, 12 +; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 8 +; GFX9-DL-NEXT: s_lshr_b32 s17, s1, 12 +; GFX9-DL-NEXT: v_and_b32_e64 v3, s0, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v4, s1, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v7, s8, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v13, s15, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v5, s10, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v11, s17, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v6, s9, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v12, s16, 15 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v6, v6, v12 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 20 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 20 ; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX9-DL-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 24 +; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 28 +; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 24 +; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v5 +; GFX9-DL-NEXT: v_and_b32_e64 v9, s5, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v15, s12, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v10, s4, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v16, s11, 15 +; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 28 +; GFX9-DL-NEXT: v_and_b32_e64 v8, s7, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v14, s14, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v17, s13 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, v10, v16 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX9-DL-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s6, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v14 +; GFX9-DL-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-DL-NEXT: v_or_b32_e32 v6, v4, v8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v7 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4
+; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v6
 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT: s_endpgm
 ;
@@ -2560,51 +2625,63 @@
 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004
-; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004
-; GFX10-DL-NEXT: s_and_b32 s6, s4, 15
-; GFX10-DL-NEXT: s_and_b32 s8, s5, 15
-; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c
-; GFX10-DL-NEXT: s_bfe_u32 s9, s5, 0x4000c
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1
-; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s8
-; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40008
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s7, s9
+; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 4
+; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 4
+; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 12
+; GFX10-DL-NEXT: s_lshr_b32 s7, s5, 12
+; GFX10-DL-NEXT: s_lshr_b32 s8, s4, 8
+; GFX10-DL-NEXT: v_and_b32_e64 v4, s0, 15
+; GFX10-DL-NEXT: v_and_b32_e64 v5, s1, 15
+; GFX10-DL-NEXT: s_lshr_b32 s0, s5, 8
+; GFX10-DL-NEXT: v_and_b32_e64 v6, s6, 15
+; GFX10-DL-NEXT: v_and_b32_e64 v8, s7, 15
+; GFX10-DL-NEXT: v_and_b32_e64 v7, s4, 15
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v5
+; GFX10-DL-NEXT: v_and_b32_e64 v9, s5, 15
+; GFX10-DL-NEXT: v_and_b32_e64 v10, s0, 15
+; GFX10-DL-NEXT: v_and_b32_e64 v5, s8, 15
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, v6, v8
 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014
-; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 28
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s1
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v9
+; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 20
+; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 20
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v10
 ; GFX10-DL-NEXT: v_and_b32_sdwa v6, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x40014
-; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 28
-; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010
-; GFX10-DL-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 16
+; GFX10-DL-NEXT: s_lshr_b32 s7, s5, 16
+; GFX10-DL-NEXT: v_and_b32_e64 v7, s0, 15
+; GFX10-DL-NEXT: v_and_b32_e64 v8, s1, 15
 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s6, s0
-; GFX10-DL-NEXT: s_bfe_u32 s8, s5, 0x40010
-; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018
-; GFX10-DL-NEXT: s_bfe_u32 s4, s5, 0x40018
+; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24
+; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 28
+; GFX10-DL-NEXT: s_lshr_b32 s4, s5, 24
+; GFX10-DL-NEXT: s_lshr_b32 s5, s5, 28
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v8
+; GFX10-DL-NEXT: v_and_b32_e64 v9, s6, 15
+; GFX10-DL-NEXT: v_and_b32_e64 v10, s7, 15
 ; GFX10-DL-NEXT: v_or_b32_e32 v5, v4, v5
-; GFX10-DL-NEXT: v_and_b32_sdwa v6, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s1, s8
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s7, s9
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, s0, s4
+; GFX10-DL-NEXT: v_and_b32_e64 v6, s0, 15
+; GFX10-DL-NEXT: v_and_b32_e64 v11, s4, 15
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s1, s5
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, v9, v10
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v5
-; GFX10-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_e32 v6, s2, v6
-; GFX10-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_or_b32_e32 v2, v6, v7
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3
+; GFX10-DL-NEXT: v_and_b32_sdwa v4, v7, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v6, v11
+; GFX10-DL-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v10
+; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4
 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v6
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v14
+; GFX10-DL-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4
 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
Index: llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -227,18 +227,16 @@
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT: v_mov_b32_e32 v0, 44
-; VI-NEXT: v_mov_b32_e32 v1, 3
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: s_and_b32 s1, s0, 0xffff
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
 ; VI-NEXT: s_add_i32 s1, s1, 12
-; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; VI-NEXT: s_or_b32 s0, s1, 4
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_and_b32 s0, s0, 0xff
-; VI-NEXT: v_or_b32_e32 v2, s0, v0
+; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; VI-NEXT: v_or_b32_e64 v1, s1, 4
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_e32 v2, 0x300, v0
 ; VI-NEXT: v_mov_b32_e32 v0, 0
 ; VI-NEXT: v_mov_b32_e32 v1, 0
 ; VI-NEXT: flat_store_short v[0:1], v2
Index: llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
+++ llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
@@ -155,7 +155,7 @@
 ; CHECK-LABEL: rotr_i64:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: neg 4, 4
-; CHECK-NEXT: rldcl 3, 3, 4, 0
+; CHECK-NEXT: rotld 3, 3, 4
 ; CHECK-NEXT: blr
 %f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %z)
 ret i64 %f
Index: llvm/test/CodeGen/SystemZ/scalar-ctlz.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/scalar-ctlz.ll
+++ llvm/test/CodeGen/SystemZ/scalar-ctlz.ll
@@ -1,6 +1,4 @@
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
-;
-; FIXME: two consecutive immediate adds not fused in i16/i8 functions.

 declare i64 @llvm.ctlz.i64(i64, i1)
 declare i32 @llvm.ctlz.i32(i32, i1)
@@ -31,8 +29,7 @@
 ; CHECK-LABEL: %bb.0:
 ; CHECK-NEXT: llgfr %r0, %r2
 ; CHECK-NEXT: flogr %r2, %r0
-; CHECK-NEXT: aghi %r2, -32
-; CHECK-NEXT: # kill
+; CHECK-NEXT: ahi %r2, -32
 ; CHECK-NEXT: br %r14
 %1 = tail call i32 @llvm.ctlz.i32(i32 %arg, i1 false)
 ret i32 %1
@@ -43,8 +40,7 @@
 ; CHECK-LABEL: %bb.0:
 ; CHECK-NEXT: llgfr %r0, %r2
 ; CHECK-NEXT: flogr %r2, %r0
-; CHECK-NEXT: aghi %r2, -32
-; CHECK-NEXT: # kill
+; CHECK-NEXT: ahi %r2, -32
 ; CHECK-NEXT: br %r14
 %1 = tail call i32 @llvm.ctlz.i32(i32 %arg, i1 true)
 ret i32 %1
@@ -55,9 +51,8 @@
 ; CHECK-LABEL: %bb.0:
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: llghr %r0, %r2
-; CHECK-NEXT: flogr %r0, %r0
-; CHECK-NEXT: aghi %r0, -32
-; CHECK-NEXT: ahik %r2, %r0, -16
+; CHECK-NEXT: flogr %r2, %r0
+; CHECK-NEXT: ahi %r2, -48
 ; CHECK-NEXT: br %r14
 %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 false)
 ret i16 %1
@@ -68,9 +63,8 @@
 ; CHECK-LABEL: %bb.0:
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: llghr %r0, %r2
-; CHECK-NEXT: flogr %r0, %r0
-; CHECK-NEXT: aghi %r0, -32
-; CHECK-NEXT: ahik %r2, %r0, -16
+; CHECK-NEXT: flogr %r2, %r0
+; CHECK-NEXT: ahi %r2, -48
 ; CHECK-NEXT: br %r14
 %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 true)
 ret i16 %1
@@ -81,9 +75,8 @@
 ; CHECK-LABEL: %bb.0:
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: llgcr %r0, %r2
-; CHECK-NEXT: flogr %r0, %r0
-; CHECK-NEXT: aghi %r0, -32
-; CHECK-NEXT: ahik %r2, %r0, -24
+; CHECK-NEXT: flogr %r2, %r0
+; CHECK-NEXT: ahi %r2, -56
 ; CHECK-NEXT: br %r14
 %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 false)
 ret i8 %1
@@ -94,9 +87,8 @@
 ; CHECK-LABEL: %bb.0:
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: llgcr %r0, %r2
-; CHECK-NEXT: flogr %r0, %r0
-; CHECK-NEXT: aghi %r0, -32
-; CHECK-NEXT: ahik %r2, %r0, -24
+; CHECK-NEXT: flogr %r2, %r0
+; CHECK-NEXT: ahi %r2, -56
 ; CHECK-NEXT: br %r14
 %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 true)
 ret i8 %1
Index: llvm/test/CodeGen/X86/and-encoding.ll
===================================================================
--- llvm/test/CodeGen/X86/and-encoding.ll
+++ llvm/test/CodeGen/X86/and-encoding.ll
@@ -22,7 +22,7 @@
 define void @f2(i16 %x, i1 *%y) nounwind {
 ; CHECK-LABEL: f2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: andb $1, %dil # encoding: [0x40,0x80,0xe7,0x01]
 ; CHECK-NEXT: movb %dil, (%rsi) # encoding: [0x40,0x88,0x3e]
 ; CHECK-NEXT: retq # encoding: [0xc3]
 %c = trunc i16 %x to i1
@@ -33,7 +33,7 @@
 define void @f3(i32 %x, i1 *%y) nounwind {
 ; CHECK-LABEL: f3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: andb $1, %dil # encoding: [0x40,0x80,0xe7,0x01]
 ; CHECK-NEXT: movb %dil, (%rsi) # encoding: [0x40,0x88,0x3e]
 ; CHECK-NEXT: retq # encoding: [0xc3]
 %c = trunc i32 %x to i1
Index: llvm/test/CodeGen/X86/avx512-mask-op.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1819,15 +1819,15 @@
 define void @store_i16_i1(i16 %x, i1 *%y) {
 ; CHECK-LABEL: store_i16_i1:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: andb $1, %dil
 ; CHECK-NEXT: movb %dil, (%rsi)
 ; CHECK-NEXT: retq
 ;
 ; X86-LABEL: store_i16_i1:
 ; X86: ## %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: andb $1, %cl
 ; X86-NEXT: movb %cl, (%eax)
 ; X86-NEXT: retl
 %c = trunc i16 %x to i1
@@ -1838,7 +1838,7 @@
 define void @store_i8_i1(i8 %x, i1 *%y) {
 ; CHECK-LABEL: store_i8_i1:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: andb $1, %dil
 ; CHECK-NEXT: movb %dil, (%rsi)
 ; CHECK-NEXT: retq
 ;
Index: llvm/test/CodeGen/X86/bool-math.ll
===================================================================
--- llvm/test/CodeGen/X86/bool-math.ll
+++ llvm/test/CodeGen/X86/bool-math.ll
@@ -266,7 +266,7 @@
 ; X64-NEXT: shrq $32, %rdi
 ; X64-NEXT: shrq $32, %rax
 ; X64-NEXT: xorl %edi, %eax
-; X64-NEXT: andl $1, %eax
+; X64-NEXT: andb $1, %al
 ; X64-NEXT: # kill: def $al killed $al killed $rax
 ; X64-NEXT: retq
 ;
Index: llvm/test/CodeGen/X86/clz.ll
===================================================================
--- llvm/test/CodeGen/X86/clz.ll
+++ llvm/test/CodeGen/X86/clz.ll
@@ -143,7 +143,7 @@
 ; X32: # %bb.0:
 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: bsrl %eax, %eax
-; X32-NEXT: xorl $7, %eax
+; X32-NEXT: xorb $7, %al
 ; X32-NEXT: # kill: def $al killed $al killed $eax
 ; X32-NEXT: retl
 ;
@@ -151,7 +151,7 @@
 ; X64: # %bb.0:
 ; X64-NEXT: movzbl %dil, %eax
 ; X64-NEXT: bsrl %eax, %eax
-; X64-NEXT: xorl $7, %eax
+; X64-NEXT: xorb $7, %al
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq
 ;
@@ -159,7 +159,7 @@
 ; X32-CLZ: # %bb.0:
 ; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X32-CLZ-NEXT: lzcntl %eax, %eax
-; X32-CLZ-NEXT: addl $-24, %eax
+; X32-CLZ-NEXT: addb $-24, %al
 ; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax
 ; X32-CLZ-NEXT: retl
 ;
@@ -167,7 +167,7 @@
 ; X64-CLZ: # %bb.0:
 ; X64-CLZ-NEXT: movzbl %dil, %eax
 ; X64-CLZ-NEXT: lzcntl %eax, %eax
-; X64-CLZ-NEXT: addl $-24, %eax
+; X64-CLZ-NEXT: addb $-24, %al
 ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT: retq
 %tmp2 = call i8 @llvm.ctlz.i8( i8 %x, i1 true )
@@ -285,7 +285,7 @@
 ; X32-NEXT: # %bb.2: # %cond.false
 ; X32-NEXT: movzbl %al, %eax
 ; X32-NEXT: bsrl %eax, %eax
-; X32-NEXT: xorl $7, %eax
+; X32-NEXT: xorb $7, %al
 ; X32-NEXT: # kill: def $al killed $al killed $eax
 ; X32-NEXT: retl
 ; X32-NEXT: .LBB8_1:
@@ -300,7 +300,7 @@
 ; X64-NEXT: # %bb.2: # %cond.false
 ; X64-NEXT: movzbl %dil, %eax
 ; X64-NEXT: bsrl %eax, %eax
-; X64-NEXT: xorl $7, %eax
+; X64-NEXT: xorb $7, %al
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq
 ; X64-NEXT: .LBB8_1:
@@ -312,7 +312,7 @@
 ; X32-CLZ: # %bb.0:
 ; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X32-CLZ-NEXT: lzcntl %eax, %eax
-; X32-CLZ-NEXT: addl $-24, %eax
+; X32-CLZ-NEXT: addb $-24, %al
 ; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax
 ; X32-CLZ-NEXT: retl
 ;
@@ -320,7 +320,7 @@
 ; X64-CLZ: # %bb.0:
 ; X64-CLZ-NEXT: movzbl %dil, %eax
 ; X64-CLZ-NEXT: lzcntl %eax, %eax
-; X64-CLZ-NEXT: addl $-24, %eax
+; X64-CLZ-NEXT: addb $-24, %al
 ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT: retq
 %tmp1 = call i8 @llvm.ctlz.i8(i8 %n, i1 false)
@@ -826,7 +826,7 @@
 ; X32-NEXT: orb $64, %al
 ; X32-NEXT: movzbl %al, %eax
 ; X32-NEXT: bsrl %eax, %eax
-; X32-NEXT: xorl $7, %eax
+; X32-NEXT: xorb $7, %al
 ; X32-NEXT: # kill: def $al killed $al killed $eax
 ; X32-NEXT: retl
 ;
@@ -835,7 +835,7 @@
 ; X64-NEXT: orb $64, %dil
 ; X64-NEXT: movzbl %dil, %eax
 ; X64-NEXT: bsrl %eax, %eax
-; X64-NEXT: xorl $7, %eax
+; X64-NEXT: xorb $7, %al
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq
 ;
@@ -845,7 +845,7 @@
 ; X32-CLZ: # %bb.0:
 ; X32-CLZ-NEXT: orb $64, %al
 ; X32-CLZ-NEXT: movzbl %al, %eax
 ; X32-CLZ-NEXT: lzcntl %eax, %eax
-; X32-CLZ-NEXT: addl $-24, %eax
+; X32-CLZ-NEXT: addb $-24, %al
 ; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax
 ; X32-CLZ-NEXT: retl
 ;
@@ -854,7 +854,7 @@
 ; X64-CLZ: # %bb.0:
 ; X64-CLZ-NEXT: orb $64, %dil
 ; X64-CLZ-NEXT: movzbl %dil, %eax
 ; X64-CLZ-NEXT: lzcntl %eax, %eax
-; X64-CLZ-NEXT: addl $-24, %eax
+; X64-CLZ-NEXT: addb $-24, %al
 ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT: retq
Index: llvm/test/CodeGen/X86/fast-isel-cmp.ll
===================================================================
--- llvm/test/CodeGen/X86/fast-isel-cmp.ll
+++ llvm/test/CodeGen/X86/fast-isel-cmp.ll
@@ -9,7 +9,7 @@
 ; SDAG: ## %bb.0:
 ; SDAG-NEXT: cmpeqss %xmm1, %xmm0
 ; SDAG-NEXT: movd %xmm0, %eax
-; SDAG-NEXT: andl $1, %eax
+; SDAG-NEXT: andb $1, %al
 ; SDAG-NEXT: ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT: retq
 ;
@@ -353,7 +353,7 @@
 ; SDAG: ## %bb.0:
 ; SDAG-NEXT: cmpneqss %xmm1, %xmm0
 ; SDAG-NEXT: movd %xmm0, %eax
-; SDAG-NEXT: andl $1, %eax
+; SDAG-NEXT: andb $1, %al
 ; SDAG-NEXT: ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT: retq
 ;
@@ -593,7 +593,7 @@
 ; SDAG-NEXT: xorps %xmm1, %xmm1
 ; SDAG-NEXT: cmpeqss %xmm0, %xmm1
 ; SDAG-NEXT: movd %xmm1, %eax
-; SDAG-NEXT: andl $1, %eax
+; SDAG-NEXT: andb $1, %al
 ; SDAG-NEXT: ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT: retq
 ;
@@ -1248,7 +1248,7 @@
 ; SDAG-NEXT: xorps %xmm1, %xmm1
 ; SDAG-NEXT: cmpneqss %xmm0, %xmm1
 ; SDAG-NEXT: movd %xmm1, %eax
-; SDAG-NEXT: andl $1, %eax
+; SDAG-NEXT: andb $1, %al
 ; SDAG-NEXT: ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT: retq
 ;
Index: llvm/test/CodeGen/X86/funnel-shift.ll
===================================================================
--- llvm/test/CodeGen/X86/funnel-shift.ll
+++ llvm/test/CodeGen/X86/funnel-shift.ll
@@ -107,9 +107,8 @@
 ; X64-AVX2-NEXT: movq %rdi, %rax
 ; X64-AVX2-NEXT: movl %r8d, %ecx
 ; X64-AVX2-NEXT: shlq %cl, %rax
-; X64-AVX2-NEXT: movl $37, %ecx
-; X64-AVX2-NEXT: subl %r8d, %ecx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX2-NEXT: movb $37, %cl
+; X64-AVX2-NEXT: subb %r8b, %cl
 ; X64-AVX2-NEXT: shrq %cl, %rsi
 ; X64-AVX2-NEXT: orq %rax, %rsi
 ; X64-AVX2-NEXT: testq %r8, %r8
@@ -293,9 +292,8 @@
 ; X64-AVX2-NEXT: subq %rax, %r8
 ; X64-AVX2-NEXT: movl %r8d, %ecx
 ; X64-AVX2-NEXT: shrq %cl, %r9
-; X64-AVX2-NEXT: movl $37, %ecx
-; X64-AVX2-NEXT: subl %r8d, %ecx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX2-NEXT: movb $37, %cl
+; X64-AVX2-NEXT: subb %r8b, %cl
 ; X64-AVX2-NEXT: shlq %cl, %rdi
 ; X64-AVX2-NEXT: orq %r9, %rdi
 ; X64-AVX2-NEXT: testq %r8, %r8
@@ -382,16 +380,15 @@
 ; X32-SSE2-LABEL: fshl_i32_undef0_msk:
 ; X32-SSE2: # %bb.0:
 ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE2-NEXT: andl $7, %ecx
-; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-SSE2-NEXT: andb $7, %cl
 ; X32-SSE2-NEXT: shldl %cl, %eax, %eax
 ; X32-SSE2-NEXT: retl
 ;
 ; X64-AVX2-LABEL: fshl_i32_undef0_msk:
 ; X64-AVX2: # %bb.0:
 ; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: andl $7, %ecx
+; X64-AVX2-NEXT: andb $7, %cl
 ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-AVX2-NEXT: shldl %cl, %edi, %eax
 ; X64-AVX2-NEXT: retq
@@ -569,16 +566,15 @@
 ; X32-SSE2-LABEL: fshr_i32_undef1_msk:
 ; X32-SSE2: # %bb.0:
 ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE2-NEXT: andl $7, %ecx
-; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-SSE2-NEXT: andb $7, %cl
 ; X32-SSE2-NEXT: shrdl %cl, %eax, %eax
 ; X32-SSE2-NEXT: retl
 ;
 ; X64-AVX2-LABEL: fshr_i32_undef1_msk:
 ; X64-AVX2: # %bb.0:
 ; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: andl $7, %ecx
+; X64-AVX2-NEXT: andb $7, %cl
 ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
 ; X64-AVX2-NEXT: retq
Index: llvm/test/CodeGen/X86/lzcnt.ll
===================================================================
--- llvm/test/CodeGen/X86/lzcnt.ll
+++ llvm/test/CodeGen/X86/lzcnt.ll
@@ -13,7 +13,7 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: lzcntl %eax, %eax
-; X86-NEXT: addl $-24, %eax
+; X86-NEXT: addb $-24, %al
 ; X86-NEXT: # kill: def $al killed $al killed $eax
 ; X86-NEXT: retl
 ;
@@ -21,7 +21,7 @@
 ; X32: # %bb.0:
 ; X32-NEXT: movzbl %dil, %eax
 ; X32-NEXT: lzcntl %eax, %eax
-; X32-NEXT: addl $-24, %eax
+; X32-NEXT: addb $-24, %al
 ; X32-NEXT: # kill: def $al killed $al killed $eax
 ; X32-NEXT: retq
 ;
@@ -29,7 +29,7 @@
 ; X64: # %bb.0:
 ; X64-NEXT: movzbl %dil, %eax
 ; X64-NEXT: lzcntl %eax, %eax
-; X64-NEXT: addl $-24, %eax
+; X64-NEXT: addb $-24, %al
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq
 %tmp = tail call i8 @llvm.ctlz.i8( i8 %x, i1 false )
@@ -108,7 +108,7 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: lzcntl %eax, %eax
-; X86-NEXT: addl $-24, %eax
+; X86-NEXT: addb $-24, %al
 ; X86-NEXT: # kill: def $al killed $al killed $eax
 ; X86-NEXT: retl
 ;
@@ -116,7 +116,7 @@
 ; X32: # %bb.0:
 ; X32-NEXT: movzbl %dil, %eax
 ; X32-NEXT: lzcntl %eax, %eax
-; X32-NEXT: addl $-24, %eax
+; X32-NEXT: addb $-24, %al
 ; X32-NEXT: # kill: def $al killed $al killed $eax
 ; X32-NEXT: retq
 ;
@@ -124,7 +124,7 @@
 ; X64: # %bb.0:
 ; X64-NEXT: movzbl %dil, %eax
 ; X64-NEXT: lzcntl %eax, %eax
-; X64-NEXT: addl $-24, %eax
+; X64-NEXT: addb $-24, %al
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq
 %tmp = tail call i8 @llvm.ctlz.i8( i8 %x, i1 true )
Index: llvm/test/CodeGen/X86/masked_store_trunc.ll
===================================================================
--- llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -394,7 +394,7 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
+; AVX1-NEXT: notb %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB1_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -466,7 +466,7 @@
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
-; AVX2-NEXT: notl %eax
+; AVX2-NEXT: notb %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB1_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -784,7 +784,7 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
+; AVX1-NEXT: notb %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB2_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -860,7 +860,7 @@
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
-; AVX2-NEXT: notl %eax
+; AVX2-NEXT: notb %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB2_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -1012,7 +1012,7 @@
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT: movmskps %xmm3, %eax
-; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: xorb $15, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB3_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -1051,7 +1051,7 @@
 ; SSE4-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm3
 ; SSE4-NEXT: movmskps %xmm3, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB3_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -1152,7 +1152,7 @@
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT: movmskps %xmm3, %eax
-; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: xorb $15, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB4_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -1196,7 +1196,7 @@
 ; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm3
 ; SSE4-NEXT: movmskps %xmm3, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB4_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -1237,7 +1237,7 @@
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: xorl $15, %eax
+; AVX1-NEXT: xorb $15, %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB4_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -1280,7 +1280,7 @@
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: xorl $15, %eax
+; AVX2-NEXT: xorb $15, %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB4_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -1386,7 +1386,7 @@
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT: movmskps %xmm3, %ecx
-; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: xorb $15, %cl
 ; SSE2-NEXT: testb $1, %cl
 ; SSE2-NEXT: movd %xmm0, %eax
 ; SSE2-NEXT: jne .LBB5_1
@@ -1429,7 +1429,7 @@
 ; SSE4-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm3
 ; SSE4-NEXT: movmskps %xmm3, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB5_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -1469,7 +1469,7 @@
 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: xorl $15, %eax
+; AVX1-NEXT: xorb $15, %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB5_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -1511,7 +1511,7 @@
 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: xorl $15, %eax
+; AVX2-NEXT: xorb $15, %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB5_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -1614,7 +1614,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movmskpd %xmm1, %eax
-; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: xorb $3, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB6_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -1637,7 +1637,7 @@
 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2
 ; SSE4-NEXT: movmskpd %xmm2, %eax
-; SSE4-NEXT: xorl $3, %eax
+; SSE4-NEXT: xorb $3, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB6_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -1718,7 +1718,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movmskpd %xmm1, %eax
-; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: xorb $3, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB7_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -1743,7 +1743,7 @@
 ; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2
 ; SSE4-NEXT: movmskpd %xmm2, %eax
-; SSE4-NEXT: xorl $3, %eax
+; SSE4-NEXT: xorb $3, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB7_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -1766,7 +1766,7 @@
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vmovmskpd %xmm1, %eax
-; AVX-NEXT: xorl $3, %eax
+; AVX-NEXT: xorb $3, %al
 ; AVX-NEXT: testb $1, %al
 ; AVX-NEXT: jne .LBB7_1
 ; AVX-NEXT: # %bb.2: # %else
@@ -1841,7 +1841,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movmskpd %xmm1, %eax
-; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: xorb $3, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: movd %xmm0, %ecx
 ; SSE2-NEXT: jne .LBB8_1
@@ -1864,7 +1864,7 @@
 ; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2
 ; SSE4-NEXT: movmskpd %xmm2, %eax
-; SSE4-NEXT: xorl $3, %eax
+; SSE4-NEXT: xorb $3, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB8_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -1886,7 +1886,7 @@
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vmovmskpd %xmm1, %eax
-; AVX-NEXT: xorl $3, %eax
+; AVX-NEXT: xorb $3, %al
 ; AVX-NEXT: testb $1, %al
 ; AVX-NEXT: jne .LBB8_1
 ; AVX-NEXT: # %bb.2: # %else
@@ -3492,7 +3492,7 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
+; AVX1-NEXT: notb %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB11_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -3559,7 +3559,7 @@
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
-; AVX2-NEXT: notl %eax
+; AVX2-NEXT: notb %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB11_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -3869,7 +3869,7 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
+; AVX1-NEXT: notb %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB12_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -3939,7 +3939,7 @@
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
-; AVX2-NEXT: notl %eax
+; AVX2-NEXT: notb %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB12_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -4098,7 +4098,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT: movmskps %xmm2, %eax
-; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: xorb $15, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB13_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -4138,7 +4138,7 @@
 ; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT: movmskps %xmm2, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB13_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -4174,7 +4174,7 @@
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vmovmskps %xmm1, %eax
-; AVX-NEXT: xorl $15, %eax
+; AVX-NEXT: xorb $15, %al
 ; AVX-NEXT: testb $1, %al
 ; AVX-NEXT: jne .LBB13_1
 ; AVX-NEXT: # %bb.2: # %else
@@ -4272,7 +4272,7 @@
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT: movmskps %xmm2, %ecx
-; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: xorb $15, %cl
 ; SSE2-NEXT: testb $1, %cl
 ; SSE2-NEXT: movd %xmm0, %eax
 ; SSE2-NEXT: jne .LBB14_1
@@ -4312,7 +4312,7 @@
 ; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT: movmskps %xmm2, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB14_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -4348,7 +4348,7 @@
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vmovmskps %xmm1, %eax
-; AVX-NEXT: xorl $15, %eax
+; AVX-NEXT: xorb $15, %al
 ; AVX-NEXT: testb $1, %al
 ; AVX-NEXT: jne .LBB14_1
 ; AVX-NEXT: # %bb.2: # %else
Index: llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
===================================================================
--- llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -710,7 +710,7 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
+; AVX1-NEXT: notb %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB1_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -789,7 +789,7 @@
 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
-; AVX2-NEXT: notl %eax
+; AVX2-NEXT: notb %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB1_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -1282,7 +1282,7 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
+; AVX1-NEXT: notb %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB2_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -1368,7 +1368,7 @@
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
-; AVX2-NEXT: notl %eax
+; AVX2-NEXT: notb %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB2_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -1586,7 +1586,7 @@
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm9
 ; SSE2-NEXT: movmskps %xmm9, %eax
-; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: xorb $15, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB3_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -1642,7 +1642,7 @@
 ; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm4
 ; SSE4-NEXT: movmskps %xmm4, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB3_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -1829,7 +1829,7 @@
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm9
 ; SSE2-NEXT: movmskps %xmm9, %eax
-; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: xorb $15, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB4_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -1890,7 +1890,7 @@
 ; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm4
 ; SSE4-NEXT: movmskps %xmm4, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB4_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -1941,7 +1941,7 @@
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: xorl $15, %eax
+; AVX1-NEXT: xorb $15, %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB4_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -1990,7 +1990,7 @@
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: xorl $15, %eax
+; AVX2-NEXT: xorb $15, %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB4_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -2166,7 +2166,7 @@
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm9
 ; SSE2-NEXT: movmskps %xmm9, %ecx
-; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: xorb $15, %cl
 ; SSE2-NEXT: testb $1, %cl
 ; SSE2-NEXT: movd %xmm0, %eax
 ; SSE2-NEXT: jne .LBB5_1
@@ -2226,7 +2226,7 @@
 ; SSE4-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm4
 ; SSE4-NEXT: movmskps %xmm4, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB5_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -2276,7 +2276,7 @@
 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: xorl $15, %eax
+; AVX1-NEXT: xorb $15, %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB5_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -2324,7 +2324,7 @@
 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: xorl $15, %eax
+; AVX2-NEXT: xorb $15, %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB5_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -2469,7 +2469,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movmskpd %xmm1, %eax
-; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: xorb $3, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB6_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -2501,7 +2501,7 @@
 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT: movmskpd %xmm3, %eax
-; SSE4-NEXT: xorl $3, %eax
+; SSE4-NEXT: xorb $3, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB6_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -2638,7 +2638,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movmskpd %xmm1, %eax
-; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: xorb $3, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB7_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -2672,7 +2672,7 @@
 ; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT: movmskpd %xmm3, %eax
-; SSE4-NEXT: xorl $3, %eax
+; SSE4-NEXT: xorb $3, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB7_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -2701,7 +2701,7 @@
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vmovmskpd %xmm1, %eax
-; AVX-NEXT: xorl $3, %eax
+; AVX-NEXT: xorb $3, %al
 ; AVX-NEXT: testb $1, %al
 ; AVX-NEXT: jne .LBB7_1
 ; AVX-NEXT: # %bb.2: # %else
@@ -2820,7 +2820,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: movmskpd %xmm0, %eax
-; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: xorb $3, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: movd %xmm3, %ecx
 ; SSE2-NEXT: jne .LBB8_1
@@ -2852,7 +2852,7 @@
 ; SSE4-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT: movmskpd %xmm3, %eax
-; SSE4-NEXT: xorl $3, %eax
+; SSE4-NEXT: xorb $3, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB8_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -2880,7 +2880,7 @@
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vmovmskpd %xmm1, %eax
-; AVX-NEXT: xorl $3, %eax
+; AVX-NEXT: xorb $3, %al
 ; AVX-NEXT: testb $1, %al
 ; AVX-NEXT: jne .LBB8_1
 ; AVX-NEXT: # %bb.2: # %else
@@ -4477,7 +4477,7 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
+; AVX1-NEXT: notb %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB11_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -4544,7 +4544,7 @@
 ; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
-; AVX2-NEXT: notl %eax
+; AVX2-NEXT: notb %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB11_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -4859,7 +4859,7 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
+; AVX1-NEXT: notb %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB12_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -4927,7 +4927,7 @@
 ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
-; AVX2-NEXT: notl %eax
+; AVX2-NEXT: notb %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB12_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -5096,7 +5096,7 @@
 ; SSE2-NEXT: packssdw %xmm0, %xmm0
 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT: movmskps %xmm2, %eax
-; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: xorb $15, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB13_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -5136,7 +5136,7 @@
 ; SSE4-NEXT: packssdw %xmm0, %xmm0
 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT: movmskps %xmm2, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB13_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -5172,7 +5172,7 @@
 ; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vmovmskps %xmm1, %eax
-; AVX-NEXT: xorl $15, %eax
+; AVX-NEXT: xorb $15, %al
 ; AVX-NEXT: testb $1, %al
 ; AVX-NEXT: jne .LBB13_1
 ; AVX-NEXT: # %bb.2: # %else
@@ -5296,7 +5296,7 @@
 ; SSE2-NEXT: packuswb %xmm3, %xmm3
 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT: movmskps %xmm2, %ecx
-; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: xorb $15, %cl
 ; SSE2-NEXT: testb $1, %cl
 ; SSE2-NEXT: movd %xmm3, %eax
 ; SSE2-NEXT: jne .LBB14_1
@@ -5338,7 +5338,7 @@
 ; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT: movmskps %xmm2, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB14_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -5376,7 +5376,7 @@
 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: xorl $15, %eax
+; AVX1-NEXT: xorb $15, %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB14_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -5416,7 +5416,7 @@
 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: xorl $15, %eax
+; AVX2-NEXT: xorb $15, %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB14_1
 ; AVX2-NEXT: # %bb.2: # %else
Index: llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
===================================================================
--- llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -584,7 +584,7 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
+; AVX1-NEXT: notb %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB1_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -662,7 +662,7 @@
 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
-; AVX2-NEXT: notl %eax
+; AVX2-NEXT: notb %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB1_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -1066,7 +1066,7 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
+; AVX1-NEXT: notb %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB2_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -1151,7 +1151,7 @@
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
-; AVX2-NEXT: notl %eax
+; AVX2-NEXT: notb %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB2_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -1336,7 +1336,7 @@
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT: movmskps %xmm3, %eax
-; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: xorb $15, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB3_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -1389,7 +1389,7 @@
 ; SSE4-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2]
 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm6
 ; SSE4-NEXT: movmskps %xmm6, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB3_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -1540,7 +1540,7 @@
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT: movmskps %xmm3, %eax
-; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: xorb $15, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB4_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -1598,7 +1598,7 @@
 ; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm8
 ; SSE4-NEXT: movmskps %xmm8, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB4_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -1648,7 +1648,7 @@
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: xorl $15, %eax
+; AVX1-NEXT: xorb $15, %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB4_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -1697,7 +1697,7 @@
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: xorl $15, %eax
+; AVX2-NEXT: xorb $15, %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB4_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -1837,7 +1837,7 @@
 ; SSE2-NEXT: packuswb %xmm4, %xmm4
 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm9
 ; SSE2-NEXT: movmskps %xmm9, %ecx
-; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: xorb $15, %cl
 ; SSE2-NEXT: testb $1, %cl
 ; SSE2-NEXT: movd %xmm4, %eax
 ; SSE2-NEXT: jne .LBB5_1
@@ -1894,7 +1894,7 @@
 ; SSE4-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm8
 ; SSE4-NEXT: movmskps %xmm8, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB5_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -1943,7 +1943,7 @@
 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: xorl $15, %eax
+; AVX1-NEXT: xorb $15, %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB5_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -1991,7 +1991,7 @@
 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: xorl $15, %eax
+; AVX2-NEXT: xorb $15, %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB5_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -2115,7 +2115,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movmskpd %xmm1, %eax
-; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: xorb $3, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB6_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -2145,7 +2145,7 @@
 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT: movmskpd %xmm3, %eax
-; SSE4-NEXT: xorl $3, %eax
+; SSE4-NEXT: xorb $3, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB6_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -2259,7 +2259,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movmskpd %xmm1, %eax
-; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: xorb $3, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB7_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -2291,7 +2291,7 @@
 ; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT: movmskpd %xmm3, %eax
-; SSE4-NEXT: xorl $3, %eax
+; SSE4-NEXT: xorb $3, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB7_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -2319,7 +2319,7 @@
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vmovmskpd %xmm1, %eax
-; AVX-NEXT: xorl $3, %eax
+; AVX-NEXT: xorb $3, %al
 ; AVX-NEXT: testb $1, %al
 ; AVX-NEXT: jne .LBB7_1
 ; AVX-NEXT: # %bb.2: # %else
@@ -2417,7 +2417,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: movmskpd %xmm0, %eax
-; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: xorb $3, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: movd %xmm4, %ecx
 ; SSE2-NEXT: jne .LBB8_1
@@ -2447,7 +2447,7 @@
 ; SSE4-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm4
 ; SSE4-NEXT: movmskpd %xmm4, %eax
-; SSE4-NEXT: xorl $3, %eax
+; SSE4-NEXT: xorb $3, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB8_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -2474,7 +2474,7 @@
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vmovmskpd %xmm1, %eax
-; AVX-NEXT: xorl $3, %eax
+; AVX-NEXT: xorb $3, %al
 ; AVX-NEXT: testb $1, %al
 ; AVX-NEXT: jne .LBB8_1
 ; AVX-NEXT: # %bb.2: # %else
@@ -4154,7 +4154,7 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
+; AVX1-NEXT: notb %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB11_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -4223,7 +4223,7 @@
 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
-; AVX2-NEXT: notl %eax
+; AVX2-NEXT: notb %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB11_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -4556,7 +4556,7 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
+; AVX1-NEXT: notb %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB12_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -4628,7 +4628,7 @@
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vmovmskps %ymm1, %eax
-; AVX2-NEXT: notl %eax
+; AVX2-NEXT: notb %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB12_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -4799,7 +4799,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT: movmskps %xmm2, %eax
-; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: xorb $15, %al
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: jne .LBB13_1
 ; SSE2-NEXT: # %bb.2: # %else
@@ -4840,7 +4840,7 @@
 ; SSE4-NEXT: packusdw %xmm0, %xmm0
 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT: movmskps %xmm2, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB13_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -4877,7 +4877,7 @@
 ; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: xorl $15, %eax
+; AVX1-NEXT: xorb $15, %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB13_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -4915,7 +4915,7 @@
 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: xorl $15, %eax
+; AVX2-NEXT: xorb $15, %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB13_1
 ; AVX2-NEXT: # %bb.2: # %else
@@ -5027,7 +5027,7 @@
 ; SSE2-NEXT: packuswb %xmm4, %xmm4
 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT: movmskps %xmm2, %ecx
-; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: xorb $15, %cl
 ; SSE2-NEXT: testb $1, %cl
 ; SSE2-NEXT: movd %xmm4, %eax
 ; SSE2-NEXT: jne .LBB14_1
@@ -5068,7 +5068,7 @@
 ; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT: movmskps %xmm2, %eax
-; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: xorb $15, %al
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB14_1
 ; SSE4-NEXT: # %bb.2: # %else
@@ -5105,7 +5105,7 @@
 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: xorl $15, %eax
+; AVX1-NEXT: xorb $15, %al
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: jne .LBB14_1
 ; AVX1-NEXT: # %bb.2: # %else
@@ -5143,7 +5143,7 @@
 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: xorl $15, %eax
+; AVX2-NEXT: xorb $15, %al
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: jne .LBB14_1
 ; AVX2-NEXT: # %bb.2: # %else
Index: llvm/test/CodeGen/X86/movmsk-cmp.ll
===================================================================
--- llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -4464,8 +4464,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
 ; SSE2-NEXT: pand %xmm0, %xmm1
 ; SSE2-NEXT: movmskpd %xmm1, %eax
-; SSE2-NEXT: xorl $3, %eax
-; SSE2-NEXT: cmpb $3, %al
+; SSE2-NEXT: testb %al, %al
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
@@ -4473,8 +4472,7 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vmovmskpd %xmm0, %eax
-; AVX-NEXT: xorl $3, %eax
-; AVX-NEXT: cmpb $3, %al
+; AVX-NEXT: testb %al, %al
 ; AVX-NEXT: sete %al
 ; AVX-NEXT: retq
 ;
Index: llvm/test/CodeGen/X86/mul-constant-i8.ll
===================================================================
--- llvm/test/CodeGen/X86/mul-constant-i8.ll
+++ llvm/test/CodeGen/X86/mul-constant-i8.ll
@@ -463,7 +463,7 @@
 ; X64-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-NEXT: addl %edi, %edi
 ; X64-NEXT: leal (%rdi,%rdi,4), %eax
-; X64-NEXT: negl %eax
+; X64-NEXT: negb %al
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq
 %m = mul i8 %x, -10
@@ -476,7 +476,7 @@
 ; X64-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-NEXT: shll $2, %edi
 ; X64-NEXT: leal (%rdi,%rdi,8), %eax
-; X64-NEXT: negl %eax
+; X64-NEXT: negb %al
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq
 %m = mul i8 %x, -36
Index: llvm/test/CodeGen/X86/pr15267.ll
===================================================================
--- llvm/test/CodeGen/X86/pr15267.ll
+++ llvm/test/CodeGen/X86/pr15267.ll
@@ -73,62 +73,75 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movq (%rdi), %rax
 ; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl $4, %ecx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: shrb $4, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: andl $15, %edx
+; CHECK-NEXT: andb $15, %dl
+; CHECK-NEXT: movzbl %dl, %edx
 ; CHECK-NEXT: vmovd %edx, %xmm0
 ; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movl %eax, %ecx
 ; CHECK-NEXT: shrl $8, %ecx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: andb $15, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movl %eax, %ecx
 ; CHECK-NEXT: shrl $12, %ecx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: andb $15, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movl %eax, %ecx
 ; CHECK-NEXT: shrl $16, %ecx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: andb $15, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movl %eax, %ecx
 ; CHECK-NEXT: shrl $20, %ecx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: andb $15, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movl %eax, %ecx
 ; CHECK-NEXT: shrl $24, %ecx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: andb $15, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movl %eax, %ecx
 ; CHECK-NEXT: shrl $28, %ecx
 ; CHECK-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movq %rax, %rcx
 ; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: andb $15, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movq %rax, %rcx
 ; CHECK-NEXT: shrq $36, %rcx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: andb $15, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movq %rax, %rcx
 ; CHECK-NEXT: shrq $40, %rcx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: andb $15, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movq %rax, %rcx
 ; CHECK-NEXT: shrq $44, %rcx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: andb $15, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movq %rax, %rcx
 ; CHECK-NEXT: shrq $48, %rcx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: andb $15, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movq %rax, %rcx
 ; CHECK-NEXT: shrq $52, %rcx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: andb $15, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: movq %rax, %rcx
 ; CHECK-NEXT: shrq $56, %rcx
-; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: andb $15, %cl
+; CHECK-NEXT: movzbl %cl, %ecx
 ; CHECK-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT: shrq $60, %rax
 ; CHECK-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
Index: llvm/test/CodeGen/X86/pr40539.ll
===================================================================
--- llvm/test/CodeGen/X86/pr40539.ll
+++ llvm/test/CodeGen/X86/pr40539.ll
@@ -18,7 +18,7 @@
 ; CHECK-NEXT: xorps %xmm0, %xmm0
 ; CHECK-NEXT: cmpeqss (%esp), %xmm0
 ; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: andb $1, %al
 ; CHECK-NEXT: # kill: def $al killed $al killed $eax
 ; CHECK-NEXT: popl %ecx
 ; CHECK-NEXT: .cfi_def_cfa_offset 4
Index: llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
===================================================================
--- llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
+++ llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
@@ -15,7 +15,7 @@
 ;
 ; CHECK32-LABEL: f32_bzhi:
 ; CHECK32: # %bb.0: # %entry
-; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
 ; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT: retl
 entry:
@@ -34,7 +34,7 @@
 ;
 ; CHECK32-LABEL: f32_bzhi_partial:
 ; CHECK32: # %bb.0: # %entry
-; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
 ; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT: retl
 entry:
Index: llvm/test/CodeGen/X86/shift-double-x86_64.ll
===================================================================
--- llvm/test/CodeGen/X86/shift-double-x86_64.ll
+++ llvm/test/CodeGen/X86/shift-double-x86_64.ll
@@ -8,7 +8,6 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movq %rdx, %rcx
 ; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: andl $63, %ecx
 ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT: shldq %cl, %rsi, %rax
 ; CHECK-NEXT: retq
@@ -25,7 +24,6 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movq %rdx, %rcx
 ; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: andl $63, %ecx
 ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT: shrdq %cl, %rdi, %rax
 ; CHECK-NEXT: retq
Index: llvm/test/CodeGen/X86/shift-double.ll
===================================================================
--- llvm/test/CodeGen/X86/shift-double.ll
+++ llvm/test/CodeGen/X86/shift-double.ll
@@ -290,11 +290,9 @@
 define i32 @test11(i32 %hi, i32 %lo, i32 %bits) nounwind {
 ; X86-LABEL: test11:
 ; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT: shldl %cl, %edx, %eax
 ; X86-NEXT: retl
 ;
@@ -302,7 +300,6 @@
 ; X64: # %bb.0:
 ; X64-NEXT: movl %edx, %ecx
 ; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl $31, %ecx
 ; X64-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT: shldl %cl, %esi, %eax
 ; X64-NEXT: retq
@@ -317,11 +314,9 @@
 define i32 @test12(i32 %hi, i32 %lo, i32 %bits) nounwind {
 ; X86-LABEL: test12:
 ; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT: shrdl %cl, %edx, %eax
 ; X86-NEXT: retl
 ;
@@ -329,7 +324,6 @@
 ; X64: # %bb.0:
 ; X64-NEXT: movl %edx, %ecx
 ; X64-NEXT: movl %esi, %eax
-; X64-NEXT: andl $31, %ecx
 ; X64-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT: shrdl %cl, %edi, %eax
 ; X64-NEXT: retq
Index: llvm/test/CodeGen/X86/vector-compare-all_of.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-compare-all_of.ll
+++ llvm/test/CodeGen/X86/vector-compare-all_of.ll
@@ -1103,8 +1103,7 @@
 ; SSE: # %bb.0:
 ; SSE-NEXT: pcmpeqd %xmm1, %xmm0
 ; SSE-NEXT: movmskps %xmm0, %eax
-; SSE-NEXT: xorl $15, %eax
-; SSE-NEXT: cmpb $15, %al
+; SSE-NEXT: testb %al, %al
 ; SSE-NEXT: sete %al
 ; SSE-NEXT: retq
 ;
@@ -1112,8 +1111,7 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vmovmskps %xmm0, %eax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: cmpb $15, %al
+; AVX-NEXT: testb %al, %al
 ; AVX-NEXT: sete %al
 ; AVX-NEXT: retq
 ;
Index: llvm/test/CodeGen/X86/vector-compare-any_of.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-compare-any_of.ll
+++ llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -1019,7 +1019,7 @@
 ; SSE: # %bb.0:
 ; SSE-NEXT: pcmpeqd %xmm1, %xmm0
 ; SSE-NEXT: movmskps %xmm0, %eax
-; SSE-NEXT: xorb $15, %al
+; SSE-NEXT: cmpb $15, %al
 ; SSE-NEXT: setne %al
 ; SSE-NEXT: retq
 ;
@@ -1027,7 +1027,7 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vmovmskps %xmm0, %eax
-; AVX-NEXT: xorb $15, %al
+; AVX-NEXT: cmpb $15, %al
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: retq
 ;
Index: llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -33,7 +33,7 @@
 ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: andl $3, %eax
+; AVX512F-NEXT: andb $3, %al
 ; AVX512F-NEXT: xorb $0, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
@@ -44,7 +44,7 @@
 ; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: andl $3, %eax
+; AVX512BW-NEXT: andb $3, %al
 ; AVX512BW-NEXT: xorb $0, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
@@ -55,7 +55,7 @@
 ; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: andl $3, %eax
+; AVX512VL-NEXT: andb $3, %al
 ; AVX512VL-NEXT: xorb $0, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: retq
@@ -86,7 +86,7 @@
 ; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: andb $15, %al
 ; AVX512F-NEXT: xorb $0, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
@@ -97,7 +97,7 @@
 ; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: andl $15, %eax
+; AVX512BW-NEXT: andb $15, %al
 ; AVX512BW-NEXT: xorb $0, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
@@ -108,7 +108,7 @@
 ; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: andb $15, %al
 ; AVX512VL-NEXT: xorb $0, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: retq
@@ -243,7 +243,7 @@
 ; AVX512F-NEXT: vpsllq $63, %ymm0, %ymm0
 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: andb $15, %al
 ; AVX512F-NEXT: xorb $0, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
@@ -254,7 +254,7 @@
 ; AVX512BW-NEXT: vpsllq $63, %ymm0, %ymm0
 ; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: andl $15, %eax
+; AVX512BW-NEXT: andb $15, %al
 ; AVX512BW-NEXT: xorb $0, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
@@ -265,7 +265,7 @@
 ; AVX512VL-NEXT: vpsllq $63, %ymm0, %ymm0
 ; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: andb $15, %al
 ; AVX512VL-NEXT: xorb $0, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
@@ -1087,7 +1087,7 @@
 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: andl $3, %eax
+; AVX512F-NEXT: andb $3, %al
 ; AVX512F-NEXT: xorb $0, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
@@ -1098,7 +1098,7 @@
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT: vptestnmq %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: andl $3, %eax
+; AVX512BW-NEXT: andb $3, %al
 ; AVX512BW-NEXT: xorb $0, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
@@ -1108,7 +1108,7 @@
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: andl $3, %eax
+; AVX512VL-NEXT: andb $3, %al
 ; AVX512VL-NEXT: xorb $0, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: retq
@@ -1141,7 +1141,7 @@
 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: andb $15, %al
 ; AVX512F-NEXT: xorb $0, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
@@ -1152,7 +1152,7 @@
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: andl $15, %eax
+; AVX512BW-NEXT: andb $15, %al
 ; AVX512BW-NEXT: xorb $0, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
@@ -1162,7 +1162,7 @@
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: andb $15, %al
 ; AVX512VL-NEXT: xorb $0, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: retq
@@ -1354,7 +1354,7 @@
 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: andb $15, %al
 ; AVX512F-NEXT: xorb $0, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
@@ -1365,7 +1365,7 @@
 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT: vptestnmq %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: andl $15, %eax
+; AVX512BW-NEXT: andb $15, %al
 ; AVX512BW-NEXT: xorb $0, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
@@ -1375,7 +1375,7 @@
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: andb $15, %al
 ; AVX512VL-NEXT: xorb $0, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper