Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -579,14 +579,6 @@ /// affected nodes are stored as a prefix in \p StoreNodes). bool MergeConsecutiveStores(StoreSDNode *St); - /// Try to transform a truncation where C is a constant: - /// (trunc (and X, C)) -> (and (trunc X), (trunc C)) - /// - /// \p N needs to be a truncation and its first operand an AND. Other - /// requirements are checked by the function (e.g. that trunc is - /// single-use) and if missed an empty SDValue is returned. - SDValue distributeTruncateThroughAnd(SDNode *N); - /// Helper function to determine whether the target supports operation /// given by \p Opcode for type \p VT, that is, whether the operation /// is legal or custom before legalizing operations, and whether is @@ -6464,29 +6456,6 @@ return DAG.getNode(LHS->getOpcode(), SDLoc(N), VT, NewShift, NewRHS); } -SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) { - assert(N->getOpcode() == ISD::TRUNCATE); - assert(N->getOperand(0).getOpcode() == ISD::AND); - - // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC) - EVT TruncVT = N->getValueType(0); - if (N->hasOneUse() && N->getOperand(0).hasOneUse() && - TLI.isTypeDesirableForOp(ISD::AND, TruncVT)) { - SDValue N01 = N->getOperand(0).getOperand(1); - if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) { - SDLoc DL(N); - SDValue N00 = N->getOperand(0).getOperand(0); - SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00); - SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01); - AddToWorklist(Trunc00.getNode()); - AddToWorklist(Trunc01.getNode()); - return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01); - } - } - - return SDValue(); -} - SDValue DAGCombiner::visitRotate(SDNode *N) { SDLoc dl(N); SDValue N0 = N->getOperand(0); @@ -6514,13 +6483,6 @@ } 
} - // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))). - if (N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) - return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1); - } - unsigned NextOp = N0.getOpcode(); // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize) if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { @@ -6592,12 +6554,6 @@ if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) return DAG.getConstant(0, SDLoc(N), VT); - // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))). - if (N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) - return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1); - } if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -6883,13 +6839,6 @@ } } - // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))). - if (N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) - return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1); - } - // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2)) // if c1 is equal to the number of bits the trunc removes if (N0.getOpcode() == ISD::TRUNCATE && @@ -7087,13 +7036,6 @@ } } - // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))). - if (N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getOpcode() == ISD::AND) { - if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) - return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1); - } - // fold operands of srl based on knowledge that the low bits are not // demanded. 
if (N1C && SimplifyDemandedBits(SDValue(N, 0))) @@ -10080,9 +10022,10 @@ case ISD::AND: case ISD::OR: case ISD::XOR: - if (!LegalOperations && N0.hasOneUse() && + if ((!LegalOperations || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) && (isConstantOrConstantVector(N0.getOperand(0), true) || - isConstantOrConstantVector(N0.getOperand(1), true))) { + isConstantOrConstantVector(N0.getOperand(1), true)) && + N0.hasOneUse()) { - // TODO: We already restricted this to pre-legalization, but for vectors + // TODO: This may also run after operation legalization; for vectors // we are extra cautious to not create an unsupported operation. // Target-specific changes are likely needed to avoid regressions here. Index: llvm/test/CodeGen/AMDGPU/idot8s.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8s.ll +++ llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -1578,66 +1578,87 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e64 v0, s2, 15 +; GFX9-NEXT: v_and_b32_e64 v5, s4, 15 +; GFX9-NEXT: s_lshr_b32 s9, s2, 8 +; GFX9-NEXT: s_lshr_b32 s11, s2, 4 +; GFX9-NEXT: s_lshr_b32 s16, s4, 8 +; GFX9-NEXT: s_lshr_b32 s18, s4, 4 +; GFX9-NEXT: v_and_b32_e64 v3, s9, 15 +; GFX9-NEXT: v_and_b32_e64 v7, s16, 15 +; GFX9-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-NEXT: s_lshr_b32 s14, s4, 16 +; GFX9-NEXT: s_lshr_b32 s10, s2, 12 +; GFX9-NEXT: s_lshr_b32 s17, s4, 12 +; GFX9-NEXT: v_and_b32_e64 v1, s11, 15 +; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_and_b32_e64 v6, s18, 15 +; GFX9-NEXT: v_and_b32_e32 v5, v2, v5 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-NEXT: v_and_b32_e64 v4, s7, 15 +; GFX9-NEXT: v_and_b32_e64 v8, s14, 15 +; GFX9-NEXT: s_lshr_b32 s8, s2, 20 +; GFX9-NEXT: s_lshr_b32 s15, 
s4, 20 +; GFX9-NEXT: v_and_b32_e64 v1, s10, 15 +; GFX9-NEXT: v_and_b32_e32 v3, v2, v3 +; GFX9-NEXT: v_and_b32_e64 v6, s17, 15 +; GFX9-NEXT: v_and_b32_e32 v7, v2, v7 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX9-NEXT: s_lshr_b32 s6, s2, 24 +; GFX9-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-NEXT: v_and_b32_e64 v3, s8, 15 +; GFX9-NEXT: v_and_b32_e32 v4, v2, v4 +; GFX9-NEXT: v_and_b32_e64 v7, s15, 15 +; GFX9-NEXT: v_and_b32_e32 v8, v2, v8 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX9-NEXT: v_and_b32_e64 v4, s6, 15 +; GFX9-NEXT: v_and_b32_e64 v8, s13, 15 +; GFX9-NEXT: v_and_b32_e32 v4, v2, v4 +; GFX9-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-NEXT: s_lshr_b32 s12, s4, 28 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX9-NEXT: v_lshl_or_b32 v4, s5, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v2, s12, 16, v2 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v4, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v1, v6 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s5, s4, 15 -; GFX9-NEXT: s_bfe_u32 s6, s4, 
0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s5, s6 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s5, s6 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40014 -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s9, s10 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s13, s2 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s7, s8 +; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s12 -; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s14, s4 -; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v9 -; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; 
GFX9-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v6, v6, v10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX9-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1645,66 +1666,87 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_and_b32_e64 v0, s2, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v5, s4, 15 +; GFX9-DL-NEXT: s_lshr_b32 s9, s2, 8 +; 
GFX9-DL-NEXT: s_lshr_b32 s11, s2, 4 +; GFX9-DL-NEXT: s_lshr_b32 s16, s4, 8 +; GFX9-DL-NEXT: s_lshr_b32 s18, s4, 4 +; GFX9-DL-NEXT: v_and_b32_e64 v3, s9, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v7, s16, 15 +; GFX9-DL-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 16 +; GFX9-DL-NEXT: s_lshr_b32 s10, s2, 12 +; GFX9-DL-NEXT: s_lshr_b32 s17, s4, 12 +; GFX9-DL-NEXT: v_and_b32_e64 v1, s11, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX9-DL-NEXT: v_and_b32_e64 v6, s18, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v5, v2, v5 +; GFX9-DL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-DL-NEXT: v_and_b32_e64 v4, s7, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v8, s14, 15 +; GFX9-DL-NEXT: s_lshr_b32 s8, s2, 20 +; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 20 +; GFX9-DL-NEXT: v_and_b32_e64 v1, s10, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v3, v2, v3 +; GFX9-DL-NEXT: v_and_b32_e64 v6, s17, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v7, v2, v7 +; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-DL-NEXT: v_and_b32_e64 v3, s8, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v4, v2, v4 +; GFX9-DL-NEXT: v_and_b32_e64 v7, s15, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8 +; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v4 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX9-DL-NEXT: v_and_b32_e64 v4, s6, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v8, s13, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v4, v2, v4 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-DL-NEXT: s_lshr_b32 s12, s4, 28 +; GFX9-DL-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, s5, 16, v4 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, s12, 16, v2 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: 
v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v1, v6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v0, v5 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40014 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s10 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s13, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s7, s8 +; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, v3 op_sel_hi:[0,1] +; GFX9-DL-NEXT: 
v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s12 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s14, s4 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v9 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX9-DL-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, Index: llvm/test/CodeGen/AMDGPU/idot8u.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8u.ll +++ llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -1832,52 +1832,69 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v14, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s9, s2, 8 +; GFX9-NEXT: v_and_b32_e64 v3, s9, 15 +; GFX9-NEXT: s_lshr_b32 s10, s2, 12 +; GFX9-NEXT: v_and_b32_e64 v0, s2, 15 +; GFX9-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-NEXT: s_lshr_b32 s11, s2, 4 +; GFX9-NEXT: s_lshr_b32 s18, s4, 4 +; GFX9-NEXT: v_and_b32_e64 v2, s10, 15 +; GFX9-NEXT: v_and_b32_e32 v3, v14, v3 +; GFX9-NEXT: v_and_b32_e64 v7, s4, 15 +; GFX9-NEXT: v_and_b32_e64 v5, s7, 15 +; GFX9-NEXT: s_lshr_b32 s8, s2, 20 +; GFX9-NEXT: s_lshr_b32 s16, s4, 8 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX9-NEXT: v_and_b32_e64 v1, s11, 15 +; GFX9-NEXT: v_and_b32_e32 v0, v14, v0 +; GFX9-NEXT: v_and_b32_e64 v8, s18, 15 +; GFX9-NEXT: v_and_b32_e32 v3, v14, v7 +; GFX9-NEXT: s_lshr_b32 s17, s4, 12 +; GFX9-NEXT: v_and_b32_e64 v4, s8, 15 +; GFX9-NEXT: v_and_b32_e32 v5, v14, v5 
+; GFX9-NEXT: v_and_b32_e64 v10, s16, 15 +; GFX9-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_and_b32_e64 v9, s17, 15 +; GFX9-NEXT: v_and_b32_e32 v5, v14, v10 +; GFX9-NEXT: v_lshl_or_b32 v5, v9, 16, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, s0, v3 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s1, v4 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, s5, v5 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_pk_mul_lo_u16 v6, s0, v6 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v5 +; GFX9-NEXT: global_load_ushort v5, v[0:1], off +; GFX9-NEXT: s_lshr_b32 s14, s4, 16 +; GFX9-NEXT: v_and_b32_e64 v12, s14, 15 +; GFX9-NEXT: s_lshr_b32 s15, s4, 20 +; GFX9-NEXT: s_lshr_b32 s6, s2, 24 
+; GFX9-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-NEXT: v_and_b32_e64 v11, s15, 15 +; GFX9-NEXT: v_and_b32_e32 v12, v14, v12 +; GFX9-NEXT: v_and_b32_e64 v6, s6, 15 +; GFX9-NEXT: v_and_b32_e64 v13, s13, 15 +; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v12 +; GFX9-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-NEXT: v_and_b32_e32 v6, v14, v6 +; GFX9-NEXT: s_lshr_b32 s12, s4, 28 +; GFX9-NEXT: v_and_b32_e32 v13, v14, v13 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v11 +; GFX9-NEXT: v_lshl_or_b32 v13, s12, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v6, s5, 16, v6 +; GFX9-NEXT: v_pk_mul_lo_u16 v6, v6, v13 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, v3, v5 +; GFX9-NEXT: v_add_u32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v[0:1], v2, off @@ -1887,52 +1904,69 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v14, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; 
GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_lshr_b32 s9, s2, 8 +; GFX9-DL-NEXT: v_and_b32_e64 v3, s9, 15 +; GFX9-DL-NEXT: s_lshr_b32 s10, s2, 12 +; GFX9-DL-NEXT: v_and_b32_e64 v0, s2, 15 +; GFX9-DL-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 4 +; GFX9-DL-NEXT: s_lshr_b32 s18, s4, 4 +; GFX9-DL-NEXT: v_and_b32_e64 v2, s10, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v3, v14, v3 +; GFX9-DL-NEXT: v_and_b32_e64 v7, s4, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v5, s7, 15 +; GFX9-DL-NEXT: s_lshr_b32 s8, s2, 20 +; GFX9-DL-NEXT: s_lshr_b32 s16, s4, 8 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX9-DL-NEXT: v_and_b32_e64 v1, s11, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v0, v14, v0 +; GFX9-DL-NEXT: v_and_b32_e64 v8, s18, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v3, v14, v7 +; GFX9-DL-NEXT: s_lshr_b32 s17, s4, 12 +; GFX9-DL-NEXT: v_and_b32_e64 v4, s8, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v5, v14, v5 +; GFX9-DL-NEXT: v_and_b32_e64 v10, s16, 15 +; GFX9-DL-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX9-DL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v0, v3 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_and_b32_e64 v9, s17, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v5, v14, v10 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s0, v3 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40008 -; 
GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s1, v4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s5, v5 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, s0, v6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5 +; GFX9-DL-NEXT: global_load_ushort v5, v[0:1], off +; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 16 +; GFX9-DL-NEXT: v_and_b32_e64 v12, s14, 15 +; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 20 +; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-DL-NEXT: v_and_b32_e64 v11, s15, 15 +; GFX9-DL-NEXT: v_and_b32_e32 v12, v14, v12 +; GFX9-DL-NEXT: v_and_b32_e64 v6, s6, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v13, s13, 15 +; GFX9-DL-NEXT: v_lshl_or_b32 v11, v11, 16, v12 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-DL-NEXT: v_and_b32_e32 v6, v14, v6 +; GFX9-DL-NEXT: s_lshr_b32 s12, s4, 28 +; GFX9-DL-NEXT: v_and_b32_e32 v13, v14, v13 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v11 +; GFX9-DL-NEXT: v_lshl_or_b32 v13, s12, 16, v13 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, s5, 16, v6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v13 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_add_u32_e32 v5, v3, v5 +; GFX9-DL-NEXT: v_add_u32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off @@ -2123,45 +2157,50 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: v_mul_lo_u16_e32 v3, s0, v3 -; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s8, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v5, s9, v5 -; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s10, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_bfe_u32 s1, s4, 0x40014 -; 
GFX9-NEXT: s_bfe_u32 s5, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: s_lshr_b32 s0, s2, 4 +; GFX9-NEXT: s_lshr_b32 s1, s2, 8 +; GFX9-NEXT: s_lshr_b32 s5, s2, 12 +; GFX9-NEXT: s_lshr_b32 s6, s4, 4 +; GFX9-NEXT: s_lshr_b32 s7, s4, 8 +; GFX9-NEXT: s_lshr_b32 s8, s4, 12 +; GFX9-NEXT: v_and_b32_e64 v3, s5, 15 +; GFX9-NEXT: v_and_b32_e64 v7, s8, 15 +; GFX9-NEXT: v_and_b32_e64 v4, s1, 15 +; GFX9-NEXT: v_and_b32_e64 v8, s7, 15 +; GFX9-NEXT: v_and_b32_e64 v5, s0, 15 +; GFX9-NEXT: v_and_b32_e64 v9, s6, 15 +; GFX9-NEXT: v_and_b32_e64 v6, s2, 15 +; GFX9-NEXT: v_and_b32_e64 v10, s4, 15 +; GFX9-NEXT: v_mul_lo_u16_e32 v6, v6, v10 +; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v4, v4, v8 +; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v3, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: s_lshr_b32 s1, s2, 20 +; GFX9-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshr_b32 s7, s4, 20 +; GFX9-NEXT: s_lshr_b32 s8, s4, 28 +; GFX9-NEXT: s_lshr_b32 s4, s4, 24 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v7, s4 -; GFX9-NEXT: v_mul_lo_u16_e32 v4, s6, v4 -; GFX9-NEXT: v_mul_lo_u16_sdwa v5, s7, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
-; GFX9-NEXT: v_mul_lo_u16_e32 v6, s8, v6 -; GFX9-NEXT: v_mul_lo_u16_sdwa v7, s2, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e64 v9, s5, 15 +; GFX9-NEXT: v_and_b32_e64 v12, s4, 15 +; GFX9-NEXT: v_and_b32_e64 v10, s1, 15 +; GFX9-NEXT: v_and_b32_e64 v13, s7, 15 +; GFX9-NEXT: v_and_b32_e64 v11, s0, 15 +; GFX9-NEXT: v_and_b32_e64 v14, s6, 15 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX9-NEXT: v_mul_lo_u16_sdwa v10, v10, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v11, v11, v14 +; GFX9-NEXT: v_mul_lo_u16_sdwa v13, s2, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v12 +; GFX9-NEXT: v_or_b32_e32 v7, v11, v10 +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 @@ -2186,45 +2225,50 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: 
s_bfe_u32 s10, s2, 0x4000c -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s0, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s8, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s9, v5 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, s10, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: s_bfe_u32 s1, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 4 +; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 8 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 12 +; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 4 +; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 8 +; GFX9-DL-NEXT: s_lshr_b32 s8, s4, 12 +; GFX9-DL-NEXT: v_and_b32_e64 v3, s5, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v7, s8, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v4, s1, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v8, s7, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v5, s0, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v9, s6, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v6, s2, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v10, s4, 15 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v6, v6, v10 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, v4, v8 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v3, v3, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX9-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 20 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 20 +; GFX9-DL-NEXT: s_lshr_b32 s8, s4, 28 +; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s4 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, s6, v4 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, s7, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v6, s8, v6 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, s2, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX9-DL-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e64 v9, s5, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v12, s4, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v10, s1, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v13, s7, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v11, s0, 15 +; GFX9-DL-NEXT: v_and_b32_e64 v14, s6, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v15, s8 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, v10, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v11, v11, v14 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v13, s2, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v12 +; GFX9-DL-NEXT: v_or_b32_e32 v7, v11, v10 +; GFX9-DL-NEXT: v_or_b32_sdwa v8, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v4, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 Index: llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -229,18 +229,16 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 44 -; VI-NEXT: v_mov_b32_e32 v1, 3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_add_i32 s1, s1, 12 -; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; VI-NEXT: s_or_b32 s0, s1, 4 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: v_or_b32_e32 v2, s0, v0 +; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; VI-NEXT: v_or_b32_e64 v1, s1, 4 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, 0x300, v0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: flat_store_short v[0:1], v2 Index: llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll =================================================================== --- llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll +++ llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll @@ -155,7 +155,7 @@ ; CHECK-LABEL: rotr_i64: ; CHECK: # %bb.0: ; CHECK-NEXT: neg 4, 4 -; CHECK-NEXT: rldcl 3, 3, 4, 0 +; CHECK-NEXT: rotld 3, 3, 4 ; CHECK-NEXT: blr %f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %z) ret i64 %f Index: llvm/test/CodeGen/SystemZ/scalar-ctlz.ll 
=================================================================== --- llvm/test/CodeGen/SystemZ/scalar-ctlz.ll +++ llvm/test/CodeGen/SystemZ/scalar-ctlz.ll @@ -1,6 +1,4 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -; -; FIXME: two consecutive immediate adds not fused in i16/i8 functions. declare i64 @llvm.ctlz.i64(i64, i1) declare i32 @llvm.ctlz.i32(i32, i1) @@ -31,7 +29,7 @@ ; CHECK-LABEL: %bb.0: ; CHECK-NEXT: llgfr %r0, %r2 ; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 +; CHECK-NEXT: ahi %r2, -32 ; CHECK-NEXT: # kill ; CHECK-NEXT: br %r14 %1 = tail call i32 @llvm.ctlz.i32(i32 %arg, i1 false) @@ -43,7 +41,7 @@ ; CHECK-LABEL: %bb.0: ; CHECK-NEXT: llgfr %r0, %r2 ; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 +; CHECK-NEXT: ahi %r2, -32 ; CHECK-NEXT: # kill ; CHECK-NEXT: br %r14 %1 = tail call i32 @llvm.ctlz.i32(i32 %arg, i1 true) @@ -56,8 +54,7 @@ ; CHECK-NEXT: # kill ; CHECK-NEXT: llghr %r0, %r2 ; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 -; CHECK-NEXT: ahi %r2, -16 +; CHECK-NEXT: ahi %r2, -48 ; CHECK-NEXT: # kill ; CHECK-NEXT: br %r14 %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 false) @@ -70,8 +67,7 @@ ; CHECK-NEXT: # kill ; CHECK-NEXT: llghr %r0, %r2 ; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 -; CHECK-NEXT: ahi %r2, -16 +; CHECK-NEXT: ahi %r2, -48 ; CHECK-NEXT: # kill ; CHECK-NEXT: br %r14 %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 true) @@ -84,8 +80,7 @@ ; CHECK-NEXT: # kill ; CHECK-NEXT: llgcr %r0, %r2 ; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 -; CHECK-NEXT: ahi %r2, -24 +; CHECK-NEXT: ahi %r2, -56 ; CHECK-NEXT: # kill ; CHECK-NEXT: br %r14 %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 false) @@ -98,8 +93,7 @@ ; CHECK-NEXT: # kill ; CHECK-NEXT: llgcr %r0, %r2 ; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 -; CHECK-NEXT: ahi %r2, -24 +; CHECK-NEXT: ahi %r2, -56 ; CHECK-NEXT: # kill ; CHECK-NEXT: br %r14 %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 true) 
Index: llvm/test/CodeGen/X86/and-encoding.ll =================================================================== --- llvm/test/CodeGen/X86/and-encoding.ll +++ llvm/test/CodeGen/X86/and-encoding.ll @@ -22,7 +22,7 @@ define void @f2(i16 %x, i1 *%y) nounwind { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: -; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01] +; CHECK-NEXT: andb $1, %dil # encoding: [0x40,0x80,0xe7,0x01] ; CHECK-NEXT: movb %dil, (%rsi) # encoding: [0x40,0x88,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %c = trunc i16 %x to i1 @@ -33,7 +33,7 @@ define void @f3(i32 %x, i1 *%y) nounwind { ; CHECK-LABEL: f3: ; CHECK: # %bb.0: -; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01] +; CHECK-NEXT: andb $1, %dil # encoding: [0x40,0x80,0xe7,0x01] ; CHECK-NEXT: movb %dil, (%rsi) # encoding: [0x40,0x88,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %c = trunc i32 %x to i1 Index: llvm/test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-mask-op.ll +++ llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1816,15 +1816,15 @@ define void @store_i16_i1(i16 %x, i1 *%y) { ; CHECK-LABEL: store_i16_i1: ; CHECK: ## %bb.0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: andb $1, %dil ; CHECK-NEXT: movb %dil, (%rsi) ; CHECK-NEXT: retq ; ; X86-LABEL: store_i16_i1: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl $1, %ecx +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: andb $1, %cl ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: retl %c = trunc i16 %x to i1 @@ -1835,7 +1835,7 @@ define void @store_i8_i1(i8 %x, i1 *%y) { ; CHECK-LABEL: store_i8_i1: ; CHECK: ## %bb.0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: andb $1, %dil ; CHECK-NEXT: movb %dil, (%rsi) ; CHECK-NEXT: retq ; Index: llvm/test/CodeGen/X86/bool-math.ll =================================================================== --- llvm/test/CodeGen/X86/bool-math.ll +++ 
llvm/test/CodeGen/X86/bool-math.ll @@ -266,7 +266,7 @@ ; X64-NEXT: shrq $32, %rdi ; X64-NEXT: shrq $32, %rax ; X64-NEXT: xorl %edi, %eax -; X64-NEXT: andl $1, %eax +; X64-NEXT: andb $1, %al ; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq ; Index: llvm/test/CodeGen/X86/clz.ll =================================================================== --- llvm/test/CodeGen/X86/clz.ll +++ llvm/test/CodeGen/X86/clz.ll @@ -143,7 +143,7 @@ ; X32: # %bb.0: ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: bsrl %eax, %eax -; X32-NEXT: xorl $7, %eax +; X32-NEXT: xorb $7, %al ; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retl ; @@ -151,7 +151,7 @@ ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: bsrl %eax, %eax -; X64-NEXT: xorl $7, %eax +; X64-NEXT: xorb $7, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -159,7 +159,7 @@ ; X32-CLZ: # %bb.0: ; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-CLZ-NEXT: lzcntl %eax, %eax -; X32-CLZ-NEXT: addl $-24, %eax +; X32-CLZ-NEXT: addb $-24, %al ; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X32-CLZ-NEXT: retl ; @@ -167,7 +167,7 @@ ; X64-CLZ: # %bb.0: ; X64-CLZ-NEXT: movzbl %dil, %eax ; X64-CLZ-NEXT: lzcntl %eax, %eax -; X64-CLZ-NEXT: addl $-24, %eax +; X64-CLZ-NEXT: addb $-24, %al ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq %tmp2 = call i8 @llvm.ctlz.i8( i8 %x, i1 true ) @@ -285,7 +285,7 @@ ; X32-NEXT: # %bb.2: # %cond.false ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: bsrl %eax, %eax -; X32-NEXT: xorl $7, %eax +; X32-NEXT: xorb $7, %al ; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retl ; X32-NEXT: .LBB8_1: @@ -300,7 +300,7 @@ ; X64-NEXT: # %bb.2: # %cond.false ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: bsrl %eax, %eax -; X64-NEXT: xorl $7, %eax +; X64-NEXT: xorb $7, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; X64-NEXT: .LBB8_1: @@ -312,7 +312,7 @@ ; X32-CLZ: # %bb.0: ; 
X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-CLZ-NEXT: lzcntl %eax, %eax -; X32-CLZ-NEXT: addl $-24, %eax +; X32-CLZ-NEXT: addb $-24, %al ; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X32-CLZ-NEXT: retl ; @@ -320,7 +320,7 @@ ; X64-CLZ: # %bb.0: ; X64-CLZ-NEXT: movzbl %dil, %eax ; X64-CLZ-NEXT: lzcntl %eax, %eax -; X64-CLZ-NEXT: addl $-24, %eax +; X64-CLZ-NEXT: addb $-24, %al ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq %tmp1 = call i8 @llvm.ctlz.i8(i8 %n, i1 false) @@ -826,7 +826,7 @@ ; X32-NEXT: orb $64, %al ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: bsrl %eax, %eax -; X32-NEXT: xorl $7, %eax +; X32-NEXT: xorb $7, %al ; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retl ; @@ -835,7 +835,7 @@ ; X64-NEXT: orb $64, %dil ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: bsrl %eax, %eax -; X64-NEXT: xorl $7, %eax +; X64-NEXT: xorb $7, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -845,7 +845,7 @@ ; X32-CLZ-NEXT: orb $64, %al ; X32-CLZ-NEXT: movzbl %al, %eax ; X32-CLZ-NEXT: lzcntl %eax, %eax -; X32-CLZ-NEXT: addl $-24, %eax +; X32-CLZ-NEXT: addb $-24, %al ; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X32-CLZ-NEXT: retl ; @@ -854,7 +854,7 @@ ; X64-CLZ-NEXT: orb $64, %dil ; X64-CLZ-NEXT: movzbl %dil, %eax ; X64-CLZ-NEXT: lzcntl %eax, %eax -; X64-CLZ-NEXT: addl $-24, %eax +; X64-CLZ-NEXT: addb $-24, %al ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq Index: llvm/test/CodeGen/X86/fast-isel-cmp.ll =================================================================== --- llvm/test/CodeGen/X86/fast-isel-cmp.ll +++ llvm/test/CodeGen/X86/fast-isel-cmp.ll @@ -9,7 +9,7 @@ ; SDAG: ## %bb.0: ; SDAG-NEXT: cmpeqss %xmm1, %xmm0 ; SDAG-NEXT: movd %xmm0, %eax -; SDAG-NEXT: andl $1, %eax +; SDAG-NEXT: andb $1, %al ; SDAG-NEXT: ## kill: def $al killed $al killed $eax ; SDAG-NEXT: retq ; @@ -353,7 +353,7 @@ ; SDAG: ## %bb.0: ; SDAG-NEXT: cmpneqss %xmm1, %xmm0 ; SDAG-NEXT: 
movd %xmm0, %eax -; SDAG-NEXT: andl $1, %eax +; SDAG-NEXT: andb $1, %al ; SDAG-NEXT: ## kill: def $al killed $al killed $eax ; SDAG-NEXT: retq ; @@ -593,7 +593,7 @@ ; SDAG-NEXT: xorps %xmm1, %xmm1 ; SDAG-NEXT: cmpeqss %xmm0, %xmm1 ; SDAG-NEXT: movd %xmm1, %eax -; SDAG-NEXT: andl $1, %eax +; SDAG-NEXT: andb $1, %al ; SDAG-NEXT: ## kill: def $al killed $al killed $eax ; SDAG-NEXT: retq ; @@ -1248,7 +1248,7 @@ ; SDAG-NEXT: xorps %xmm1, %xmm1 ; SDAG-NEXT: cmpneqss %xmm0, %xmm1 ; SDAG-NEXT: movd %xmm1, %eax -; SDAG-NEXT: andl $1, %eax +; SDAG-NEXT: andb $1, %al ; SDAG-NEXT: ## kill: def $al killed $al killed $eax ; SDAG-NEXT: retq ; Index: llvm/test/CodeGen/X86/funnel-shift.ll =================================================================== --- llvm/test/CodeGen/X86/funnel-shift.ll +++ llvm/test/CodeGen/X86/funnel-shift.ll @@ -107,9 +107,8 @@ ; X64-AVX2-NEXT: movq %rdi, %rax ; X64-AVX2-NEXT: movl %r8d, %ecx ; X64-AVX2-NEXT: shlq %cl, %rax -; X64-AVX2-NEXT: movl $37, %ecx -; X64-AVX2-NEXT: subl %r8d, %ecx -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-AVX2-NEXT: movb $37, %cl +; X64-AVX2-NEXT: subb %r8b, %cl ; X64-AVX2-NEXT: shrq %cl, %rsi ; X64-AVX2-NEXT: orq %rax, %rsi ; X64-AVX2-NEXT: testq %r8, %r8 @@ -293,9 +292,8 @@ ; X64-AVX2-NEXT: subq %rax, %r8 ; X64-AVX2-NEXT: movl %r8d, %ecx ; X64-AVX2-NEXT: shrq %cl, %r9 -; X64-AVX2-NEXT: movl $37, %ecx -; X64-AVX2-NEXT: subl %r8d, %ecx -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-AVX2-NEXT: movb $37, %cl +; X64-AVX2-NEXT: subb %r8b, %cl ; X64-AVX2-NEXT: shlq %cl, %rdi ; X64-AVX2-NEXT: orq %r9, %rdi ; X64-AVX2-NEXT: testq %r8, %r8 @@ -382,16 +380,15 @@ ; X32-SSE2-LABEL: fshl_i32_undef0_msk: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: andl $7, %ecx -; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X32-SSE2-NEXT: andb $7, %cl ; X32-SSE2-NEXT: shldl 
%cl, %eax, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshl_i32_undef0_msk: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: movl %esi, %ecx -; X64-AVX2-NEXT: andl $7, %ecx +; X64-AVX2-NEXT: andb $7, %cl ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-AVX2-NEXT: shldl %cl, %edi, %eax ; X64-AVX2-NEXT: retq @@ -569,16 +566,15 @@ ; X32-SSE2-LABEL: fshr_i32_undef1_msk: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: andl $7, %ecx -; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X32-SSE2-NEXT: andb $7, %cl ; X32-SSE2-NEXT: shrdl %cl, %eax, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshr_i32_undef1_msk: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: movl %esi, %ecx -; X64-AVX2-NEXT: andl $7, %ecx +; X64-AVX2-NEXT: andb $7, %cl ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-AVX2-NEXT: shrdl %cl, %edi, %eax ; X64-AVX2-NEXT: retq Index: llvm/test/CodeGen/X86/mul-constant-i8.ll =================================================================== --- llvm/test/CodeGen/X86/mul-constant-i8.ll +++ llvm/test/CodeGen/X86/mul-constant-i8.ll @@ -463,7 +463,7 @@ ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: addl %edi, %edi ; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: negl %eax +; X64-NEXT: negb %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, -10 @@ -476,7 +476,7 @@ ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: shll $2, %edi ; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: negl %eax +; X64-NEXT: negb %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, -36 Index: llvm/test/CodeGen/X86/pr15267.ll =================================================================== --- llvm/test/CodeGen/X86/pr15267.ll +++ llvm/test/CodeGen/X86/pr15267.ll @@ -73,62 +73,75 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movq (%rdi), %rax ; CHECK-NEXT: movl 
%eax, %ecx -; CHECK-NEXT: shrl $4, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: shrb $4, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: andl $15, %edx +; CHECK-NEXT: andb $15, %dl +; CHECK-NEXT: movzbl %dl, %edx ; CHECK-NEXT: vmovd %edx, %xmm0 ; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $8, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $12, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $16, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $20, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $24, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $28, %ecx ; CHECK-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $36, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $40, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; 
CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $44, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $48, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $52, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $56, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: shrq $60, %rax ; CHECK-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 Index: llvm/test/CodeGen/X86/pr40539.ll =================================================================== --- llvm/test/CodeGen/X86/pr40539.ll +++ llvm/test/CodeGen/X86/pr40539.ll @@ -18,7 +18,7 @@ ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cmpeqss (%esp), %xmm0 ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: popl %ecx ; CHECK-NEXT: .cfi_def_cfa_offset 4 Index: llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll =================================================================== --- llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll +++ llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll @@ -15,7 +15,7 @@ ; ; CHECK32-LABEL: f32_bzhi: ; CHECK32: # %bb.0: # %entry -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al ; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax ; CHECK32-NEXT: retl entry: @@ -34,7 +34,7 @@ ; ; CHECK32-LABEL: f32_bzhi_partial: ; CHECK32: # %bb.0: # %entry -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al ; CHECK32-NEXT: 
bzhil %eax, {{[0-9]+}}(%esp), %eax ; CHECK32-NEXT: retl entry: Index: llvm/test/CodeGen/X86/shift-double-x86_64.ll =================================================================== --- llvm/test/CodeGen/X86/shift-double-x86_64.ll +++ llvm/test/CodeGen/X86/shift-double-x86_64.ll @@ -8,7 +8,6 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdx, %rcx ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: andl $63, %ecx ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shldq %cl, %rsi, %rax ; CHECK-NEXT: retq @@ -25,7 +24,6 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdx, %rcx ; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: andl $63, %ecx ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shrdq %cl, %rdi, %rax ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/shift-double.ll =================================================================== --- llvm/test/CodeGen/X86/shift-double.ll +++ llvm/test/CodeGen/X86/shift-double.ll @@ -290,11 +290,9 @@ define i32 @test11(i32 %hi, i32 %lo, i32 %bits) nounwind { ; X86-LABEL: test11: ; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl $31, %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shldl %cl, %edx, %eax ; X86-NEXT: retl ; @@ -302,7 +300,6 @@ ; X64: # %bb.0: ; X64-NEXT: movl %edx, %ecx ; X64-NEXT: movl %edi, %eax -; X64-NEXT: andl $31, %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shldl %cl, %esi, %eax ; X64-NEXT: retq @@ -317,11 +314,9 @@ define i32 @test12(i32 %hi, i32 %lo, i32 %bits) nounwind { ; X86-LABEL: test12: ; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl $31, %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shrdl %cl, %edx, %eax ; X86-NEXT: retl ; @@ -329,7 
+324,6 @@ ; X64: # %bb.0: ; X64-NEXT: movl %edx, %ecx ; X64-NEXT: movl %esi, %eax -; X64-NEXT: andl $31, %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrdl %cl, %edi, %eax ; X64-NEXT: retq Index: llvm/test/CodeGen/X86/vector-sext-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-sext-widen.ll +++ llvm/test/CodeGen/X86/vector-sext-widen.ll @@ -3419,79 +3419,95 @@ ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $15, %ecx +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $14, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $13, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $12, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $11, %ecx -; SSE2-NEXT: andl $1, 
%ecx -; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $10, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $9, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $7, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $6, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $5, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: shrb $7, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $4, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: shrb $6, %cl +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $3, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: shrb $5, %cl +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $2, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: shrb $4, %cl +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: shrb $3, %cl +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrb $2, %cl +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; 
SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: shrb %al +; SSE2-NEXT: andb $1, %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: movd %eax, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psllw $15, %xmm0 ; SSE2-NEXT: psraw $15, %xmm0 @@ -3505,79 +3521,95 @@ ; SSSE3-NEXT: movzwl (%rdi), %eax ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $15, %ecx +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $14, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $13, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $12, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $11, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $10, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $9, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $8, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $7, %ecx -; 
SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $6, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $5, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: shrb $7, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $4, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: shrb $6, %cl +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm3 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $3, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: shrb $5, %cl +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $2, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: shrb $4, %cl +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: shrb $3, %cl +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: shrl %eax -; SSSE3-NEXT: andl $1, %eax +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrb $2, %cl +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: shrb %al +; SSSE3-NEXT: andb $1, %al +; SSSE3-NEXT: movzbl %al, %eax ; SSSE3-NEXT: movd %eax, %xmm4 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: psllw $15, %xmm0 ; SSSE3-NEXT: psraw $15, %xmm0 @@ -3590,65 +3622,81 @@ ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movzwl (%rdi), %eax ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb %cl +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: andl $1, %edx +; SSE41-NEXT: andb $1, %dl +; SSE41-NEXT: movzbl %dl, %edx ; SSE41-NEXT: movd %edx, %xmm1 ; SSE41-NEXT: pinsrb $1, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx -; 
SSE41-NEXT: shrl $2, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb $2, %cl +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $2, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $3, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb $3, %cl +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $3, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $4, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb $4, %cl +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $4, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $5, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb $5, %cl +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $5, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $6, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb $6, %cl +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $7, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $8, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $9, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $10, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $11, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $11, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; 
SSE41-NEXT: shrl $12, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $13, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $14, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 ; SSE41-NEXT: shrl $15, %eax +; SSE41-NEXT: andb $1, %al +; SSE41-NEXT: movzbl %al, %eax ; SSE41-NEXT: pinsrb $15, %eax, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: psllw $15, %xmm0 @@ -3869,82 +3917,97 @@ ; X32-SSE2-LABEL: load_sext_16i1_to_16i16: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movzwl (%eax), %eax +; X32-SSE2-NEXT: movl (%eax), %eax ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $15, %ecx +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm0 ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $14, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm1 ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $13, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $12, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $11, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $10, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X32-SSE2-NEXT: movb %ah, %cl +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $9, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrb $7, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm0 ; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $8, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: shrb $6, %cl +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $7, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: shrb $5, %cl +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm0 ; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $6, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: shrb $4, %cl +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm2 ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $5, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: shrb $3, %cl +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; 
X32-SSE2-NEXT: movd %ecx, %xmm0 ; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $4, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: shrb $2, %cl +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm3 ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $3, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $2, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: shrl %eax -; X32-SSE2-NEXT: andl $1, %eax +; X32-SSE2-NEXT: shrb %al +; X32-SSE2-NEXT: andb $1, %al +; X32-SSE2-NEXT: movzbl %al, %eax ; X32-SSE2-NEXT: movd %eax, %xmm4 ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: psllw $15, %xmm0 ; X32-SSE2-NEXT: 
psraw $15, %xmm0 @@ -3956,67 +4019,82 @@ ; X32-SSE41-LABEL: load_sext_16i1_to_16i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movzwl (%eax), %eax +; X32-SSE41-NEXT: movl (%eax), %eax ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: andl $1, %edx +; X32-SSE41-NEXT: andb $1, %dl +; X32-SSE41-NEXT: movzbl %dl, %edx ; X32-SSE41-NEXT: movd %edx, %xmm1 ; X32-SSE41-NEXT: pinsrb $1, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $2, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb $2, %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $2, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $3, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb $3, %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $3, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $4, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb $4, %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $4, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $5, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb $5, %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $5, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $6, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb $6, %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $6, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $7, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb $7, %cl +; X32-SSE41-NEXT: 
movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $7, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $8, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: movb %ah, %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $8, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx ; X32-SSE41-NEXT: shrl $9, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $9, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx ; X32-SSE41-NEXT: shrl $10, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $10, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx ; X32-SSE41-NEXT: shrl $11, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $11, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx ; X32-SSE41-NEXT: shrl $12, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $12, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx ; X32-SSE41-NEXT: shrl $13, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $13, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx ; X32-SSE41-NEXT: shrl $14, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $14, %ecx, %xmm1 ; X32-SSE41-NEXT: shrl $15, %eax +; X32-SSE41-NEXT: andb $1, %al +; X32-SSE41-NEXT: movzbl %al, %eax ; X32-SSE41-NEXT: pinsrb $15, %eax, %xmm1 ; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; X32-SSE41-NEXT: psllw $15, %xmm0 Index: llvm/test/CodeGen/X86/vector-sext.ll =================================================================== --- 
llvm/test/CodeGen/X86/vector-sext.ll +++ llvm/test/CodeGen/X86/vector-sext.ll @@ -3419,79 +3419,95 @@ ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $15, %ecx +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $14, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $13, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $12, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $11, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $10, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $9, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $7, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $6, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $5, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: shrb $7, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; 
SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $4, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: shrb $6, %cl +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $3, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: shrb $5, %cl +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $2, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: shrb $4, %cl +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: shrb $3, %cl +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrb $2, %cl +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: andb $1, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: shrb %al +; SSE2-NEXT: andb $1, %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: movd %eax, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psllw $15, %xmm0 ; SSE2-NEXT: psraw $15, %xmm0 @@ -3505,79 +3521,95 @@ ; SSSE3-NEXT: movzwl (%rdi), %eax ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $15, %ecx +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $14, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $13, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $12, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $11, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $10, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $9, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $8, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $7, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $6, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw 
{{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $5, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: shrb $7, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $4, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: shrb $6, %cl +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm3 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $3, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: shrb $5, %cl +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $2, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: shrb $4, %cl +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: shrb $3, %cl +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: 
movd %ecx, %xmm0 -; SSSE3-NEXT: shrl %eax -; SSSE3-NEXT: andl $1, %eax +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrb $2, %cl +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: andb $1, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: shrb %al +; SSSE3-NEXT: andb $1, %al +; SSSE3-NEXT: movzbl %al, %eax ; SSSE3-NEXT: movd %eax, %xmm4 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: psllw $15, %xmm0 ; SSSE3-NEXT: psraw $15, %xmm0 @@ -3590,65 +3622,81 @@ ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movzwl (%rdi), %eax ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb %cl +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: andl $1, %edx +; SSE41-NEXT: andb $1, %dl +; SSE41-NEXT: movzbl %dl, %edx ; SSE41-NEXT: movd %edx, %xmm1 ; SSE41-NEXT: pinsrb $1, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $2, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb $2, %cl +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $2, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $3, 
%ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb $3, %cl +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $3, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $4, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb $4, %cl +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $4, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $5, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb $5, %cl +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $5, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $6, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb $6, %cl +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $7, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $8, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $9, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $10, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $11, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $11, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $12, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $13, %ecx -; SSE41-NEXT: andl $1, %ecx +; 
SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 ; SSE41-NEXT: movl %eax, %ecx ; SSE41-NEXT: shrl $14, %ecx -; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 ; SSE41-NEXT: shrl $15, %eax +; SSE41-NEXT: andb $1, %al +; SSE41-NEXT: movzbl %al, %eax ; SSE41-NEXT: pinsrb $15, %eax, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: psllw $15, %xmm0 @@ -3869,82 +3917,97 @@ ; X32-SSE2-LABEL: load_sext_16i1_to_16i16: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movzwl (%eax), %eax +; X32-SSE2-NEXT: movl (%eax), %eax ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $15, %ecx +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm0 ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $14, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm1 ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $13, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $12, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X32-SSE2-NEXT: andb 
$1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $11, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $10, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X32-SSE2-NEXT: movb %ah, %cl +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 ; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shrl $9, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrb $7, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm0 ; 
X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $8, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: shrb $6, %cl +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $7, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: shrb $5, %cl +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm0 ; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $6, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: shrb $4, %cl +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm2 ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $5, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: shrb $3, %cl +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm0 ; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $4, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: shrb $2, %cl +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm3 ; X32-SSE2-NEXT: 
punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $3, %ecx -; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: andb $1, %cl +; X32-SSE2-NEXT: movzbl %cl, %ecx ; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $2, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: shrl %eax -; X32-SSE2-NEXT: andl $1, %eax +; X32-SSE2-NEXT: shrb %al +; X32-SSE2-NEXT: andb $1, %al +; X32-SSE2-NEXT: movzbl %al, %eax ; X32-SSE2-NEXT: movd %eax, %xmm4 ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: psllw $15, %xmm0 ; X32-SSE2-NEXT: psraw $15, %xmm0 @@ -3956,67 +4019,82 @@ ; X32-SSE41-LABEL: load_sext_16i1_to_16i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movzwl (%eax), %eax +; X32-SSE41-NEXT: movl (%eax), %eax ; X32-SSE41-NEXT: movl %eax, %ecx -; 
X32-SSE41-NEXT: shrl %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: andl $1, %edx +; X32-SSE41-NEXT: andb $1, %dl +; X32-SSE41-NEXT: movzbl %dl, %edx ; X32-SSE41-NEXT: movd %edx, %xmm1 ; X32-SSE41-NEXT: pinsrb $1, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $2, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb $2, %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $2, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $3, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb $3, %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $3, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $4, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb $4, %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $4, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $5, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb $5, %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $5, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $6, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb $6, %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $6, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $7, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: shrb $7, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $7, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $8, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: movb %ah, %cl +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: 
pinsrb $8, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx ; X32-SSE41-NEXT: shrl $9, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $9, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx ; X32-SSE41-NEXT: shrl $10, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $10, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx ; X32-SSE41-NEXT: shrl $11, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $11, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx ; X32-SSE41-NEXT: shrl $12, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $12, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx ; X32-SSE41-NEXT: shrl $13, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $13, %ecx, %xmm1 ; X32-SSE41-NEXT: movl %eax, %ecx ; X32-SSE41-NEXT: shrl $14, %ecx -; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: andb $1, %cl +; X32-SSE41-NEXT: movzbl %cl, %ecx ; X32-SSE41-NEXT: pinsrb $14, %ecx, %xmm1 ; X32-SSE41-NEXT: shrl $15, %eax +; X32-SSE41-NEXT: andb $1, %al +; X32-SSE41-NEXT: movzbl %al, %eax ; X32-SSE41-NEXT: pinsrb $15, %eax, %xmm1 ; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; X32-SSE41-NEXT: psllw $15, %xmm0