Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10625,8 +10625,7 @@
   }
 
   // Simplify the operands using demanded-bits information.
-  if (!VT.isVector() &&
-      SimplifyDemandedBits(SDValue(N, 0)))
+  if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1696,7 +1696,8 @@
     // zero/one bits live out.
     unsigned OperandBitWidth = Src.getScalarValueSizeInBits();
     APInt TruncMask = DemandedBits.zext(OperandBitWidth);
-    if (SimplifyDemandedBits(Src, TruncMask, Known, TLO, Depth + 1))
+    if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, Known, TLO,
+                             Depth + 1))
       return true;
     Known = Known.trunc(BitWidth);
 
@@ -1719,16 +1720,12 @@
           // undesirable.
           break;
-        auto *ShAmt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
+        SDValue ShiftAmt = Src.getOperand(1);
+        ConstantSDNode *ShAmt = isConstOrConstSplat(ShiftAmt, DemandedElts);
         if (!ShAmt || ShAmt->getAPIntValue().uge(BitWidth))
           break;
-
-        SDValue Shift = Src.getOperand(1);
         uint64_t ShVal = ShAmt->getZExtValue();
-        if (TLO.LegalTypes())
-          Shift = TLO.DAG.getConstant(ShVal, dl, getShiftAmountTy(VT, DL));
-
         APInt HighBits =
             APInt::getHighBitsSet(OperandBitWidth, OperandBitWidth - BitWidth);
         HighBits.lshrInPlace(ShVal);
@@ -1737,10 +1734,12 @@
         if (!(HighBits & DemandedBits)) {
           // None of the shifted in bits are needed. Add a truncate of the
           // shift input, then shift it.
+          if (TLO.LegalTypes())
+            ShiftAmt = TLO.DAG.getConstant(ShVal, dl, getShiftAmountTy(VT, DL));
           SDValue NewTrunc =
               TLO.DAG.getNode(ISD::TRUNCATE, dl, VT, Src.getOperand(0));
           return TLO.CombineTo(
-              Op, TLO.DAG.getNode(ISD::SRL, dl, VT, NewTrunc, Shift));
+              Op, TLO.DAG.getNode(ISD::SRL, dl, VT, NewTrunc, ShiftAmt));
         }
         break;
       }
Index: test/CodeGen/AMDGPU/idot4s.ll
===================================================================
--- test/CodeGen/AMDGPU/idot4s.ll
+++ test/CodeGen/AMDGPU/idot4s.ll
@@ -929,34 +929,30 @@
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80000
-; GFX8-NEXT: s_lshr_b32 s4, s2, 16
-; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80000
-; GFX8-NEXT: v_ashrrev_i16_e64 v4, 8, s1
-; GFX8-NEXT: s_bfe_i32 s1, s3, 0x80000
-; GFX8-NEXT: v_ashrrev_i16_e64 v6, 8, s3
-; GFX8-NEXT: s_and_b32 s3, s0, s6
-; GFX8-NEXT: v_ashrrev_i16_e64 v3, 8, s2
-; GFX8-NEXT: s_bfe_i32 s2, s4, 0x80000
-; GFX8-NEXT: v_ashrrev_i16_e64 v5, 8, s4
-; GFX8-NEXT: s_and_b32 s4, s0, s5
-; GFX8-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NEXT: s_and_b32 s2, s0, s2
-; GFX8-NEXT: s_and_b32 s0, s0, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s2
+; GFX8-NEXT: s_sext_i32_i8 s0, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s3
+; GFX8-NEXT: v_mov_b32_e32 v5, s0
+; GFX8-NEXT: s_sext_i32_i8 s1, s2
+; GFX8-NEXT: s_bfe_i32 s0,
s3, 0x80010 +; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX8-NEXT: s_bfe_i32 s4, s2, 0x80010 +; GFX8-NEXT: s_ashr_i32 s3, s3, 24 +; GFX8-NEXT: v_mov_b32_e32 v6, s0 +; GFX8-NEXT: s_ashr_i32 s2, s2, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v6, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v6, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; Index: test/CodeGen/AMDGPU/idot4u.ll =================================================================== --- test/CodeGen/AMDGPU/idot4u.ll +++ test/CodeGen/AMDGPU/idot4u.ll @@ -2037,34 +2037,33 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: s_and_b32 s7, s1, s8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_lshr_b32 s3, s1, 24 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX8-NEXT: s_and_b32 s5, s0, s8 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mul_u32_u24_e32 v4, s5, v4 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: s_lshr_b32 s4, s1, 24 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: s_mul_i32 s0, s0, s1 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s4, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s3, v5 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v4, v3, v5 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 Index: test/CodeGen/AMDGPU/idot8s.ll =================================================================== --- test/CodeGen/AMDGPU/idot8s.ll +++ test/CodeGen/AMDGPU/idot8s.ll @@ -1641,68 +1641,60 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s5, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX8-NEXT: s_lshr_b32 s0, s2, 4 -; GFX8-NEXT: s_lshr_b32 s1, s2, 8 -; GFX8-NEXT: s_lshr_b32 s5, s4, 4 -; GFX8-NEXT: s_lshr_b32 s6, s4, 8 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s6 -; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5 -; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: s_lshr_b32 s0, s2, 12 -; GFX8-NEXT: s_lshr_b32 s1, s4, 12 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s1 -; GFX8-NEXT: s_lshr_b32 s5, s2, 16 -; GFX8-NEXT: s_lshr_b32 s6, s4, 16 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v7 -; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s6 -; GFX8-NEXT: s_lshr_b32 s0, s2, 20 -; GFX8-NEXT: s_lshr_b32 s1, s4, 20 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s1 -; GFX8-NEXT: s_lshr_b32 s5, s2, 24 -; GFX8-NEXT: s_lshr_b32 s6, s4, 24 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s6 -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_lshr_b32 s1, s4, 28 -; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s1 -; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 -; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 +; GFX8-NEXT: s_lshl_b32 s1, s5, 28 +; GFX8-NEXT: s_lshl_b32 s9, s7, 28 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s11, s7, 24 +; GFX8-NEXT: s_lshl_b32 s9, s7, 20 +; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[8:9], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: s_lshl_b32 s13, s5, 24 +; GFX8-NEXT: s_lshl_b32 s1, s5, 20 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[12:13], 60 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 +; GFX8-NEXT: s_lshl_b32 s1, s7, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s10 +; GFX8-NEXT: s_lshl_b32 s11, s7, 12 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 16 +; GFX8-NEXT: v_mul_i32_i24_e32 v4, s12, v4 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s9, s7, 8 +; GFX8-NEXT: s_lshl_b32 s1, s5, 12 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s11, s5, 8 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 4 +; GFX8-NEXT: v_mov_b32_e32 v7, s10 +; GFX8-NEXT: s_lshl_b32 s9, s5, 4 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[0:1], 60 +; 
GFX8-NEXT: s_ashr_i64 s[18:19], s[10:11], 60 +; GFX8-NEXT: v_mov_b32_e32 v8, s16 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[8:9], 60 +; GFX8-NEXT: v_mov_b32_e32 v9, s20 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v6, v8, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_u32_u24 v2, v9, v10, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v11, v12, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v13, v14, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v15, v17, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v16, v18, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v5, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX8-NEXT: v_mad_i32_i24 v2, s12, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s14, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s18, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s22, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2027,87 +2019,83 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s8, s1, 4 -; GFX8-NEXT: s_lshr_b32 s9, s1, 12 -; GFX8-NEXT: s_lshr_b32 s10, s1, 8 -; GFX8-NEXT: s_lshr_b32 s15, s2, 4 -; GFX8-NEXT: s_lshr_b32 s16, s2, 12 -; GFX8-NEXT: s_lshr_b32 s17, s2, 8 -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s10 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s9 -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s8 -; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s17 -; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s16 -; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s15 -; GFX8-NEXT: s_lshr_b32 s4, s1, 20 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s6, s1, 28 -; GFX8-NEXT: s_lshr_b32 s7, s1, 24 -; GFX8-NEXT: s_lshr_b32 s11, s2, 20 -; GFX8-NEXT: s_lshr_b32 s12, s2, 16 -; GFX8-NEXT: s_lshr_b32 s13, s2, 28 -; GFX8-NEXT: s_lshr_b32 s14, s2, 24 -; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s7 -; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s4 -; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s14 -; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s13 -; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s12 -; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s11 -; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: 
v_mul_u32_u24_sdwa v5, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 -; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v9, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v5, s0, v9 -; GFX8-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX8-NEXT: v_or_b32_e32 v6, v5, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX8-NEXT: s_lshl_b32 s13, s1, 24 +; GFX8-NEXT: s_lshl_b32 s17, s1, 16 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s25, s5, 24 +; GFX8-NEXT: s_lshl_b32 s27, s5, 28 +; GFX8-NEXT: s_lshl_b32 s29, s5, 16 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s15, s1, 28 +; GFX8-NEXT: s_lshl_b32 s19, s5, 8 +; GFX8-NEXT: s_lshl_b32 s21, s5, 12 +; GFX8-NEXT: s_lshl_b32 s23, s5, 4 +; GFX8-NEXT: s_lshl_b32 s5, s5, 20 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[28:29], 60 +; GFX8-NEXT: s_lshl_b32 s7, s1, 8 +; GFX8-NEXT: s_lshl_b32 s9, s1, 12 +; GFX8-NEXT: s_lshl_b32 s11, s1, 4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 20 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX8-NEXT: v_mov_b32_e32 v6, s28 +; GFX8-NEXT: v_mov_b32_e32 v7, s16 +; GFX8-NEXT: v_mov_b32_e32 v8, s26 +; GFX8-NEXT: v_mov_b32_e32 v9, s24 +; GFX8-NEXT: v_mov_b32_e32 v10, s12 +; GFX8-NEXT: v_mul_i32_i24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_i32_i24_e32 v7, s14, v8 +; GFX8-NEXT: v_mul_i32_i24_sdwa v8, v10, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mul_i32_i24_e32 v5, s0, v5 +; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v6, s2, v7 +; GFX8-NEXT: 
s_ashr_i64 s[20:21], s[20:21], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s22 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: s_ashr_i64 s[32:33], s[22:23], 60 +; GFX8-NEXT: v_mul_i32_i24_sdwa v3, v4, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v12, s18 +; GFX8-NEXT: v_mov_b32_e32 v13, s6 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[10:11], 60 +; GFX8-NEXT: v_mov_b32_e32 v11, s32 +; GFX8-NEXT: v_mul_i32_i24_e32 v4, s8, v4 +; GFX8-NEXT: v_mul_i32_i24_sdwa v10, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_i32_i24_e32 v9, s30, v11 +; GFX8-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; Index: test/CodeGen/AMDGPU/widen-smrd-loads.ll =================================================================== --- test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -237,8 +237,8 @@ ; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; VI-NEXT: s_or_b32 s0, s1, 4 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: v_or_b32_e32 v2, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: flat_store_short v[0:1], v2 Index: test/CodeGen/ARM/lowerMUL-newload.ll =================================================================== --- test/CodeGen/ARM/lowerMUL-newload.ll +++ test/CodeGen/ARM/lowerMUL-newload.ll @@ -4,23 +4,25 @@ define void @func1(i16* %a, i16* %b, i16* %c) { ; CHECK-LABEL: func1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: add r3, r1, #16 +; CHECK-NEXT: vldr d16, [r2, #16] +; CHECK-NEXT: vldr d17, [r1, 
#16] +; CHECK-NEXT: vmovl.u16 q9, d16 +; CHECK-NEXT: vaddw.u16 q9, q9, d17 +; CHECK-NEXT: vmovn.i32 d18, q9 +; CHECK-NEXT: vldr d19, [r0, #16] +; CHECK-NEXT: vstr d18, [r0, #16] ; CHECK-NEXT: vldr d18, [r2, #16] -; CHECK-NEXT: vld1.16 {d16}, [r3:64] -; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vaddw.s16 q10, q8, d18 -; CHECK-NEXT: vmovn.i32 d19, q10 -; CHECK-NEXT: vldr d20, [r0, #16] -; CHECK-NEXT: vstr d19, [r0, #16] -; CHECK-NEXT: vldr d19, [r2, #16] -; CHECK-NEXT: vmull.s16 q11, d18, d19 -; CHECK-NEXT: vmovl.s16 q9, d19 -; CHECK-NEXT: vmla.i32 q11, q8, q9 -; CHECK-NEXT: vmovn.i32 d16, q11 +; CHECK-NEXT: vmull.s16 q10, d16, d18 +; CHECK-NEXT: vmovl.s16 q11, d18 +; CHECK-NEXT: vmovl.u16 q8, d17 +; CHECK-NEXT: vmovl.u16 q9, d19 +; CHECK-NEXT: vmla.i32 q10, q8, q11 +; CHECK-NEXT: vmovn.i32 d16, q10 ; CHECK-NEXT: vstr d16, [r1, #16] ; CHECK-NEXT: vldr d16, [r2, #16] -; CHECK-NEXT: vmlal.s16 q11, d16, d20 -; CHECK-NEXT: vmovn.i32 d16, q11 +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vmla.i32 q10, q8, q9 +; CHECK-NEXT: vmovn.i32 d16, q10 ; CHECK-NEXT: vstr d16, [r0, #16] ; CHECK-NEXT: bx lr entry: @@ -75,23 +77,26 @@ define void @func2(i16* %a, i16* %b, i16* %c) { ; CHECK-LABEL: func2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: add r3, r1, #16 +; CHECK-NEXT: vldr d16, [r2, #16] +; CHECK-NEXT: add r3, r0, #16 +; CHECK-NEXT: vldr d17, [r1, #16] +; CHECK-NEXT: vmovl.u16 q9, d16 +; CHECK-NEXT: vaddw.u16 q9, q9, d17 +; CHECK-NEXT: vmovn.i32 d18, q9 +; CHECK-NEXT: vld1.16 {d19}, [r3:64] +; CHECK-NEXT: vstr d18, [r0, #16] ; CHECK-NEXT: vldr d18, [r2, #16] -; CHECK-NEXT: vld1.16 {d16}, [r3:64] -; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vaddw.s16 q10, q8, d18 -; CHECK-NEXT: vmovn.i32 d19, q10 -; CHECK-NEXT: vldr d20, [r0, #16] -; CHECK-NEXT: vstr d19, [r0, #16] -; CHECK-NEXT: vldr d19, [r2, #16] -; CHECK-NEXT: vmull.s16 q11, d18, d19 +; CHECK-NEXT: vmull.s16 q10, d16, d18 +; CHECK-NEXT: vmovl.s16 q11, d18 +; CHECK-NEXT: vmovl.u16 q8, d17 ; CHECK-NEXT: vmovl.s16 q9, d19 -; CHECK-NEXT: vmla.i32 q11, q8, q9 -; CHECK-NEXT: vmovn.i32 d16, q11 +; CHECK-NEXT: vmla.i32 q10, q8, q11 +; CHECK-NEXT: vmovn.i32 d16, q10 ; CHECK-NEXT: vstr d16, [r1, #16] ; CHECK-NEXT: vldr d16, [r2, #16] -; CHECK-NEXT: vmlal.s16 q11, d16, d20 -; CHECK-NEXT: vaddw.s16 q8, q11, d20 +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vmla.i32 q10, q8, q9 +; CHECK-NEXT: vadd.i32 q8, q10, q9 ; CHECK-NEXT: vmovn.i32 d16, q8 ; CHECK-NEXT: vstr d16, [r0, #16] ; CHECK-NEXT: bx lr Index: test/CodeGen/X86/avg.ll =================================================================== --- test/CodeGen/X86/avg.ll +++ test/CodeGen/X86/avg.ll @@ -266,113 +266,97 @@ ; AVX1-LABEL: avg_v48i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpshufd 
{{.*#+}} xmm2 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm12 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = 
xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm3, %xmm15, %xmm15 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vmovdqa (%rsi), %xmm6 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm11, %xmm7 +; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14 +; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm3, %xmm12, %xmm11 -; AVX1-NEXT: vpsubd %xmm3, %xmm10, %xmm10 -; AVX1-NEXT: vpsubd %xmm3, %xmm9, %xmm9 -; AVX1-NEXT: vpsubd %xmm3, %xmm8, %xmm8 -; AVX1-NEXT: vpsubd %xmm3, %xmm15, %xmm12 -; AVX1-NEXT: vpsubd %xmm3, %xmm7, %xmm7 -; AVX1-NEXT: vpsubd %xmm3, %xmm14, 
%xmm0 -; AVX1-NEXT: vpsubd %xmm3, %xmm13, %xmm2 -; AVX1-NEXT: vpsubd %xmm3, %xmm5, %xmm5 -; AVX1-NEXT: vpsubd %xmm3, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm4 -; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm7, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm12, %xmm4 +; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm2, %xmm13, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm2, %xmm12, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm4, %xmm14, %xmm4 ; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm8, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm9, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm5, %xmm10, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm10, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6 -; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm2 -; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm5, %xmm9, %xmm5 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsubw %xmm5, %xmm7, %xmm6 +; AVX1-NEXT: vpsrlw $1, %xmm6, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpsubw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: 
vpsrlw $1, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsubw %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpsubw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovdqu %xmm2, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vmovdqu %xmm4, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v48i8: @@ -380,64 +364,68 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero ; AVX2-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-NEXT: vmovdqa 16(%rsi), %xmm7 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = 
xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm5, %ymm3, %ymm3 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm5, %ymm9, %ymm5 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 -; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: vpackusdw %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-NEXT: vpackusdw %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpackuswb %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm5[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm9, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = 
xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm9, %ymm4, %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm7, %ymm1, %ymm7 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm0, %ymm5, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm3 +; AVX2-NEXT: vpsrlw $1, %xmm3, %xmm5 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm5, %xmm5 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX2-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX2-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm7, %xmm4 +; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 +; AVX2-NEXT: vpsrlw $1, %xmm4, %xmm4 +; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm4 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX2-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX2-NEXT: vpsubw %xmm2, %xmm5, %xmm5 +; AVX2-NEXT: vpsrlw $1, %xmm5, %xmm5 +; AVX2-NEXT: vpand %xmm3, %xmm5, %xmm5 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm8, %xmm4 +; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm2 +; AVX2-NEXT: vpsrlw $1, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqu %xmm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) Index: test/CodeGen/X86/combine-sra.ll =================================================================== --- test/CodeGen/X86/combine-sra.ll +++ test/CodeGen/X86/combine-sra.ll @@ -248,15 +248,17 @@ ; ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: combine_vec_ashr_trunc_ashr: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7] +; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper Index: test/CodeGen/X86/known-signbits-vector.ll =================================================================== --- test/CodeGen/X86/known-signbits-vector.ll +++ test/CodeGen/X86/known-signbits-vector.ll @@ -161,9 +161,8 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) nounwind { ; X32-LABEL: signbits_sext_shuffle_sitofp: ; X32: # %bb.0: -; X32-NEXT: vpmovsxdq %xmm0, %xmm1 -; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X32-NEXT: vpmovsxdq %xmm0, %xmm0 +; X32-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -174,9 +173,8 @@ ; ; X64-LABEL: signbits_sext_shuffle_sitofp: ; X64: # %bb.0: -; X64-NEXT: vpmovsxdq %xmm0, %xmm1 -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-NEXT: vpmovsxdq %xmm0, %xmm0 +; X64-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -194,7 +192,8 @@ define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind { ; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp: ; X32: # %bb.0: -; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; X32-NEXT: vpsrlq $32, %xmm0, %xmm0 +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X32-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X32-NEXT: retl ; Index: test/CodeGen/X86/min-legal-vector-width.ll =================================================================== --- test/CodeGen/X86/min-legal-vector-width.ll +++ test/CodeGen/X86/min-legal-vector-width.ll @@ -808,10 +808,9 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" { ; CHECK-LABEL: trunc_v16i32_v16i16_sign: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsrad $16, 32(%rdi), %ymm0 -; CHECK-NEXT: vpsrad $16, (%rdi), %ymm1 -; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %a = load <16 x i32>, <16 x i32>* %x %b = ashr <16 x i32> %a, @@ -822,9 +821,9 @@ define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" { ; CHECK-LABEL: trunc_v32i16_v32i8_sign: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsraw $8, 32(%rdi), %ymm0 -; CHECK-NEXT: vpsraw $8, (%rdi), %ymm1 -; CHECK-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpsrlw $8, 32(%rdi), %ymm0 +; CHECK-NEXT: vpsrlw $8, (%rdi), %ymm1 +; CHECK-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; CHECK-NEXT: retq %a = load <32 x i16>, <32 x i16>* %x Index: test/CodeGen/X86/vector-trunc.ll 
=================================================================== --- test/CodeGen/X86/vector-trunc.ll +++ test/CodeGen/X86/vector-trunc.ll @@ -72,24 +72,28 @@ ; ; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc8i64_8i32_ashr: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32_ashr: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: retq entry: @@ -401,33 +405,48 @@ } define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) { -; SSE-LABEL: trunc8i32_8i16_ashr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: trunc8i32_8i16_ashr: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i32_8i16_ashr: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,10,11,14,15,14,15,255,255] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i32_8i16_ashr: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc8i32_8i16_ashr: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc8i32_8i16_ashr: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc8i32_8i16_ashr: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; 
AVX512F-NEXT: vzeroupper @@ -435,14 +454,14 @@ ; ; AVX512VL-LABEL: trunc8i32_8i16_ashr: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc8i32_8i16_ashr: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper @@ -450,7 +469,7 @@ ; ; AVX512BWVL-LABEL: trunc8i32_8i16_ashr: ; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -703,28 +722,52 @@ } define void @trunc16i32_16i16_ashr(<16 x i32> %a) { -; SSE-LABEL: trunc16i32_16i16_ashr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: psrad $16, %xmm3 -; SSE-NEXT: psrad $16, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: movdqu %xmm2, (%rax) -; SSE-NEXT: movdqu %xmm0, (%rax) -; SSE-NEXT: retq +; SSE2-LABEL: trunc16i32_16i16_ashr: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i16_ashr: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm3 +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: packssdw %xmm3, %xmm2 +; SSSE3-NEXT: movdqu %xmm2, (%rax) +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i16_ashr: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: movdqu %xmm2, (%rax) +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc16i32_16i16_ashr: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -732,9 +775,9 @@ ; ; AVX2-LABEL: trunc16i32_16i16_ashr: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1 -; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -909,41 +952,65 @@ } define void 
@trunc16i32_16i8_ashr(<16 x i32> %a) { -; SSE-LABEL: trunc16i32_16i8_ashr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: psrad $24, %xmm1 -; SSE-NEXT: psrad $24, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: psrad $24, %xmm3 -; SSE-NEXT: psrad $24, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: movdqu %xmm0, (%rax) -; SSE-NEXT: retq +; SSE2-LABEL: trunc16i32_16i8_ashr: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrld $24, %xmm1 +; SSE2-NEXT: psrld $24, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: psrld $24, %xmm3 +; SSE2-NEXT: psrld $24, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i8_ashr: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: psrld $24, %xmm1 +; SSSE3-NEXT: psrld $24, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: psrld $24, %xmm3 +; SSSE3-NEXT: psrld $24, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i8_ashr: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrld $24, %xmm1 +; SSE41-NEXT: psrld $24, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psrld $24, %xmm3 +; SSE41-NEXT: psrld $24, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc16i32_16i8_ashr: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc16i32_16i8_ashr: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1 -; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1121,27 +1188,27 @@ define void @trunc16i16_16i8_ashr(<16 x i16> %a) { ; SSE-LABEL: trunc16i16_16i8_ashr: ; SSE: # %bb.0: # %entry -; SSE-NEXT: psraw $8, %xmm1 -; SSE-NEXT: psraw $8, %xmm0 -; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: movdqu %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc16i16_16i8_ashr: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; 
AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc16i16_16i8_ashr: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1164,7 +1231,7 @@ ; ; AVX512BW-LABEL: trunc16i16_16i8_ashr: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpsraw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper