diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11952,8 +11952,7 @@
   }

   // Simplify the operands using demanded-bits information.
-  if (!VT.isVector() &&
-      SimplifyDemandedBits(SDValue(N, 0)))
+  if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);

   // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1986,7 +1986,8 @@
     // zero/one bits live out.
     unsigned OperandBitWidth = Src.getScalarValueSizeInBits();
     APInt TruncMask = DemandedBits.zext(OperandBitWidth);
-    if (SimplifyDemandedBits(Src, TruncMask, Known, TLO, Depth + 1))
+    if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, Known, TLO,
+                             Depth + 1))
       return true;
     Known = Known.trunc(BitWidth);

@@ -2009,9 +2010,9 @@
         // undesirable.
         break;

-      SDValue ShAmt = Src.getOperand(1);
-      auto *ShAmtC = dyn_cast<ConstantSDNode>(ShAmt);
-      if (!ShAmtC || ShAmtC->getAPIntValue().uge(BitWidth))
+      const APInt *ShAmtC =
+          TLO.DAG.getValidShiftAmountConstant(Src, DemandedElts);
+      if (!ShAmtC)
         break;

       uint64_t ShVal = ShAmtC->getZExtValue();
@@ -2023,6 +2024,7 @@
       if (!(HighBits & DemandedBits)) {
         // None of the shifted in bits are needed. Add a truncate of the
         // shift input, then shift it.
+        SDValue ShAmt = Src.getOperand(1);
         if (TLO.LegalTypes())
           ShAmt = TLO.DAG.getConstant(ShVal, dl, getShiftAmountTy(VT, DL));
         SDValue NewTrunc =
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3399,6 +3399,7 @@

 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
   return N->getOpcode() == ISD::SIGN_EXTEND ||
+         N->getOpcode() == ISD::ANY_EXTEND ||
          isExtendedBUILD_VECTOR(N, DAG, true);
 }

diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -96,7 +96,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, <8 x i8>* %A
@@ -113,7 +113,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
 ; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -131,7 +131,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
 ; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -258,7 +258,7 @@
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
 ; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: umlal v0.8h, v1.8b, v2.8b
+; CHECK-NEXT: smlal v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, <8 x i16>* %A
@@ -278,7 +278,7 @@
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
 ; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h
+; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
 ; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -299,7 +299,7 @@
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
 ; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s
+; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -428,7 +428,7 @@
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
 ; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: umlsl v0.8h, v1.8b, v2.8b
+; CHECK-NEXT: smlsl v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, <8 x i16>* %A
@@ -448,7 +448,7 @@
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
 ; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h
+; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h
 ; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -469,7 +469,7 @@
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
 ; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s
+; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -586,7 +586,7 @@
 ; CHECK-LABEL: amull_extvec_v8i8_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi v1.8b, #12
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
 ; CHECK-NEXT: ret
 %tmp3 = zext <8 x i8> %arg to <8 x i16>
@@ -600,7 +600,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #1234
 ; CHECK-NEXT: dup v1.4h, w8
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
 ; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -615,7 +615,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #1234
 ; CHECK-NEXT: dup v1.2s, w8
-; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
 ; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -752,8 +752,8 @@
 define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
 ; CHECK-LABEL: amull2_i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: umull v2.8h, v0.8b, v1.8b
-; CHECK-NEXT: umull2 v1.8h, v0.16b, v1.16b
+; CHECK-NEXT: smull v2.8h, v0.8b, v1.8b
+; CHECK-NEXT: smull2 v1.8h, v0.16b, v1.16b
 ; CHECK-NEXT: bic v2.8h, #255, lsl #8
 ; CHECK-NEXT: bic v1.8h, #255, lsl #8
 ; CHECK-NEXT: mov v0.16b, v2.16b
@@ -768,8 +768,8 @@
 define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
 ; CHECK-LABEL: amull2_i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: umull v2.4s, v0.4h, v1.4h
-; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT: smull v2.4s, v0.4h, v1.4h
+; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
 ; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff
 ; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
 ; CHECK-NEXT: and v0.16b, v2.16b, v3.16b
@@ -784,8 +784,8 @@
 define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
 ; CHECK-LABEL: amull2_i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: umull v2.2d, v0.2s, v1.2s
-; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v2.2d, v0.2s, v1.2s
+; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s
 ; CHECK-NEXT: movi v3.2d, #0x000000ffffffff
 ; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
 ; CHECK-NEXT: and v0.16b, v2.16b, v3.16b
diff --git a/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll b/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
--- a/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
+++ b/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: mlai16_trunc:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
-; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h
+; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: ret
 entry:
@@ -21,7 +21,7 @@
 define <4 x i32> @mlai16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
 ; CHECK-LABEL: mlai16_and:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
 ; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -43,7 +43,7 @@
 ; CHECK-NEXT: ldr d1, [x1, #16]
 ; CHECK-NEXT: ldr d2, [x2, #16]
 ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
-; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h
+; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: str d0, [x0, #16]
 ; CHECK-NEXT: ret
@@ -89,8 +89,8 @@
 define <4 x i32> @addmuli16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
 ; CHECK-LABEL: addmuli16_and:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT: umlal v1.4s, v0.4h, v2.4h
+; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h
+; CHECK-NEXT: smlal v1.4s, v0.4h, v2.4h
 ; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
 ; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT: ret
@@ -141,7 +141,7 @@
 ; CHECK-LABEL: mlai32_trunc:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
-; CHECK-NEXT: saddw v0.2d, v0.2d, v2.2s
+; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
 ; CHECK-NEXT: xtn v0.2s, v0.2d
 ; CHECK-NEXT: ret
 entry:
@@ -157,7 +157,7 @@
 define <2 x i64> @mlai32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
 ; CHECK-LABEL: mlai32_and:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umull v0.2d, v1.2s, v0.2s
+; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
 ; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
 ; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -179,7 +179,7 @@
 ; CHECK-NEXT: ldr d1, [x1, #32]
 ; CHECK-NEXT: ldr d2, [x2, #32]
 ; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
-; CHECK-NEXT: saddw v0.2d, v0.2d, v2.2s
+; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
 ; CHECK-NEXT: xtn v0.2s, v0.2d
 ; CHECK-NEXT: str d0, [x0, #32]
 ; CHECK-NEXT: ret
@@ -225,8 +225,8 @@
 define <2 x i64> @addmuli32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
 ; CHECK-LABEL: addmuli32_and:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: umlal v1.2d, v0.2s, v2.2s
+; CHECK-NEXT: smull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: smlal v1.2d, v0.2s, v2.2s
 ; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
 ; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT: ret
@@ -359,7 +359,7 @@
 ; CHECK-NEXT: str d1, [x1, #16]
 ; CHECK-NEXT: ldr d1, [x2, #16]
 ; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
-; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h
+; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: str d0, [x0, #16]
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -240,8 +240,8 @@
 ; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; VI-NEXT: s_or_b32 s0, s1, 4
 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_and_b32 s0, s0, 0xff
-; VI-NEXT: v_or_b32_e32 v2, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_mov_b32_e32 v0, 0
 ; VI-NEXT: v_mov_b32_e32 v1, 0
 ; VI-NEXT: flat_store_short v[0:1], v2
diff --git a/llvm/test/CodeGen/ARM/lowerMUL-newload.ll b/llvm/test/CodeGen/ARM/lowerMUL-newload.ll
--- a/llvm/test/CodeGen/ARM/lowerMUL-newload.ll
+++ b/llvm/test/CodeGen/ARM/lowerMUL-newload.ll
@@ -4,8 +4,8 @@
 define arm_aapcs_vfpcc <4 x i16> @mla_args(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
 ; CHECK-LABEL: mla_args:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmull.s16 q8, d1, d0
-; CHECK-NEXT: vaddw.s16 q8, q8, d2
+; CHECK-NEXT: vmull.u16 q8, d1, d0
+; CHECK-NEXT: vaddw.u16 q8, q8, d2
 ; CHECK-NEXT: vmovn.i32 d0, q8
 ; CHECK-NEXT: bx lr
 entry:
@@ -24,8 +24,8 @@
 ; CHECK-NEXT: vldr d16, [r0, #16]
 ; CHECK-NEXT: vldr d17, [r1, #16]
 ; CHECK-NEXT: vldr d18, [r2, #16]
-; CHECK-NEXT: vmull.s16 q8, d17, d16
-; CHECK-NEXT: vaddw.s16 q8, q8, d18
+; CHECK-NEXT: vmull.u16 q8, d17, d16
+; CHECK-NEXT: vaddw.u16 q8, q8, d18
 ; CHECK-NEXT: vmovn.i32 d16, q8
 ; CHECK-NEXT: vstr d16, [r0, #16]
 ; CHECK-NEXT: bx lr
@@ -54,8 +54,8 @@
 define arm_aapcs_vfpcc <4 x i16> @addmul_args(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
 ; CHECK-LABEL: addmul_args:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmull.s16 q8, d1, d2
-; CHECK-NEXT: vmlal.s16 q8, d0, d2
+; CHECK-NEXT: vmull.u16 q8, d1, d2
+; CHECK-NEXT: vmlal.u16 q8, d0, d2
 ; CHECK-NEXT: vmovn.i32 d0, q8
 ; CHECK-NEXT: bx lr
 entry:
@@ -73,9 +73,9 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vldr d16, [r2, #16]
 ; CHECK-NEXT: vldr d17, [r1, #16]
-; CHECK-NEXT: vmull.s16 q9, d17, d16
+; CHECK-NEXT: vmull.u16 q9, d17, d16
 ; CHECK-NEXT: vldr d17, [r0, #16]
-; CHECK-NEXT: vmlal.s16 q9, d17, d16
+; CHECK-NEXT: vmlal.u16 q9, d17, d16
 ; CHECK-NEXT: vmovn.i32 d16, q9
 ; CHECK-NEXT: vstr d16, [r0, #16]
 ; CHECK-NEXT: bx lr
@@ -108,7 +108,7 @@
 ; CHECK-NEXT: vldr d18, [r2, #16]
 ; CHECK-NEXT: vld1.16 {d16}, [r3:64]
 ; CHECK-NEXT: vmovl.u16 q8, d16
-; CHECK-NEXT: vaddw.s16 q10, q8, d18
+; CHECK-NEXT: vaddw.u16 q10, q8, d18
 ; CHECK-NEXT: vmovn.i32 d19, q10
 ; CHECK-NEXT: vldr d20, [r0, #16]
 ; CHECK-NEXT: vstr d19, [r0, #16]
@@ -119,7 +119,7 @@
 ; CHECK-NEXT: vmovn.i32 d16, q11
 ; CHECK-NEXT: vstr d16, [r1, #16]
 ; CHECK-NEXT: vldr d16, [r2, #16]
-; CHECK-NEXT: vmlal.s16 q11, d16, d20
+; CHECK-NEXT: vmlal.u16 q11, d16, d20
 ; CHECK-NEXT: vmovn.i32 d16, q11
 ; CHECK-NEXT: vstr d16, [r0, #16]
 ; CHECK-NEXT: bx lr
@@ -175,23 +175,26 @@
 define void @func2(i16* %a, i16* %b, i16* %c) {
 ; CHECK-LABEL: func2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: add r3, r1, #16
+; CHECK-NEXT: vldr d16, [r1, #16]
+; CHECK-NEXT: add r3, r0, #16
+; CHECK-NEXT: vldr d17, [r2, #16]
+; CHECK-NEXT: vaddl.u16 q9, d17, d16
+; CHECK-NEXT: vmovn.i32 d18, q9
+; CHECK-NEXT: vld1.16 {d19}, [r3:64]
+; CHECK-NEXT: vstr d18, [r0, #16]
 ; CHECK-NEXT: vldr d18, [r2, #16]
-; CHECK-NEXT: vld1.16 {d16}, [r3:64]
+; CHECK-NEXT: vmull.s16 q10, d17, d18
+; CHECK-NEXT: vmovl.s16 q11, d18
 ; CHECK-NEXT: vmovl.u16 q8, d16
-; CHECK-NEXT: vaddw.s16 q10, q8, d18
-; CHECK-NEXT: vmovn.i32 d19, q10
-; CHECK-NEXT: vldr d20, [r0, #16]
-; CHECK-NEXT: vstr d19, [r0, #16]
-; CHECK-NEXT: vldr d19, [r2, #16]
-; CHECK-NEXT: vmull.s16 q11, d18, d19
 ; CHECK-NEXT: vmovl.s16 q9, d19
-; CHECK-NEXT: vmla.i32 q11, q8, q9
-; CHECK-NEXT: vmovn.i32 d16, q11
+; CHECK-NEXT: vmla.i32 q10, q8, q11
+; CHECK-NEXT: vmovn.i32 d16, q10
 ; CHECK-NEXT: vstr d16, [r1, #16]
-; CHECK-NEXT: vldr d16, [r2, #16]
-; CHECK-NEXT: vmlal.s16 q11, d16, d20
-; CHECK-NEXT: vaddw.s16 q8, q11, d20
+; CHECK-NEXT: add r1, r2, #16
+; CHECK-NEXT: vld1.16 {d16}, [r1:64]
+; CHECK-NEXT: vmovl.u16 q8, d16
+; CHECK-NEXT: vmla.i32 q10, q8, q9
+; CHECK-NEXT: vadd.i32 q8, q10, q9
 ; CHECK-NEXT: vmovn.i32 d16, q8
 ; CHECK-NEXT: vstr d16, [r0, #16]
 ; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -1503,7 +1503,6 @@
 ; CHECK-NEXT: vldrht.s32 q3, [r1], #8
 ; CHECK-NEXT: vmul.i32 q2, q3, q2
 ; CHECK-NEXT: vqshrnb.s32 q2, q2, #15
-; CHECK-NEXT: vmovlb.s16 q2, q2
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vstrht.32 q2, [r2], #8
 ; CHECK-NEXT: le lr, .LBB8_2
@@ -2717,7 +2716,6 @@
 ; CHECK-NEXT: vldrbt.s16 q6, [r1], #8
 ; CHECK-NEXT: vmul.i16 q5, q6, q5
 ; CHECK-NEXT: vqshrnb.s16 q5, q5, #7
-; CHECK-NEXT: vmovlb.s8 q5, q5
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vstrbt.16 q5, [r2], #8
 ; CHECK-NEXT: le lr, .LBB17_2
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
@@ -153,7 +153,7 @@
 ; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT: vmov.u16 r1, q1[4]
 ; CHECK-NEXT: vmullb.s16 q2, q3, q2
-; CHECK-NEXT: vshr.s32 q3, q2, #16
+; CHECK-NEXT: vshr.u32 q3, q2, #16
 ; CHECK-NEXT: vmov r0, s12
 ; CHECK-NEXT: vmov.16 q2[0], r0
 ; CHECK-NEXT: vmov r0, s13
@@ -174,7 +174,7 @@
 ; CHECK-NEXT: vmov.u16 r1, q0[5]
 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT: vmullb.s16 q0, q1, q3
-; CHECK-NEXT: vshr.s32 q0, q0, #16
+; CHECK-NEXT: vshr.u32 q0, q0, #16
 ; CHECK-NEXT: vmov r0, s0
 ; CHECK-NEXT: vmov.16 q2[4], r0
 ; CHECK-NEXT: vmov r0, s1
@@ -318,7 +318,7 @@
 ; CHECK-NEXT: vmov.u8 r0, q0[7]
 ; CHECK-NEXT: vmov.16 q3[7], r0
 ; CHECK-NEXT: vmullb.s8 q2, q3, q2
-; CHECK-NEXT: vshr.s16 q3, q2, #8
+; CHECK-NEXT: vshr.u16 q3, q2, #8
 ; CHECK-NEXT: vmov.u16 r0, q3[0]
 ; CHECK-NEXT: vmov.8 q2[0], r0
 ; CHECK-NEXT: vmov.u16 r0, q3[1]
@@ -368,7 +368,7 @@
 ; CHECK-NEXT: vmov.u8 r0, q0[15]
 ; CHECK-NEXT: vmov.16 q1[7], r0
 ; CHECK-NEXT: vmullb.s8 q0, q1, q3
-; CHECK-NEXT: vshr.s16 q0, q0, #8
+; CHECK-NEXT: vshr.u16 q0, q0, #8
 ; CHECK-NEXT: vmov.u16 r0, q0[0]
 ; CHECK-NEXT: vmov.8 q2[8], r0
 ; CHECK-NEXT: vmov.u16 r0, q0[1]
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -252,7 +252,7 @@
 ;
 ; AVX2-FAST-LABEL: combine_vec_ashr_trunc_ashr:
 ; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <1,3,5,7,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7]
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -251,9 +251,8 @@
 define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) nounwind {
 ; X86-LABEL: signbits_sext_shuffle_sitofp:
 ; X86: # %bb.0:
-; X86-NEXT: vpmovsxdq %xmm0, %xmm1
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X86-NEXT: vpmovsxdq %xmm0, %xmm0
+; X86-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X86-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -263,9 +262,8 @@
 ;
 ; X64-AVX1-LABEL: signbits_sext_shuffle_sitofp:
 ; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -275,7 +273,7 @@
 ;
 ; X64-AVX2-LABEL: signbits_sext_shuffle_sitofp:
 ; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -920,10 +920,9 @@
 define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
 ; CHECK-LABEL: trunc_v16i32_v16i16_sign:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpsrad $16, 32(%rdi), %ymm0
-; CHECK-NEXT: vpsrad $16, (%rdi), %ymm1
-; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %a = load <16 x i32>, <16 x i32>* %x
 %b = ashr <16 x i32> %a,
@@ -932,13 +931,20 @@
 }

 define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: trunc_v32i16_v32i8_sign:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpsraw $8, 32(%rdi), %ymm0
-; CHECK-NEXT: vpsraw $8, (%rdi), %ymm1
-; CHECK-NEXT: vpacksswb %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; CHECK-NEXT: retq
+; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0
+; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %ymm1
+; CHECK-AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
+; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_sign:
+; CHECK-VBMI: # %bb.0:
+; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
+; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
+; CHECK-VBMI-NEXT: retq
 %a = load <32 x i16>, <32 x i16>* %x
 %b = ashr <32 x i16> %a,
 %c = trunc <32 x i16> %b to <32 x i8>
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -73,7 +73,7 @@
 ;
 ; AVX2-FAST-LABEL: trunc8i64_8i32_ashr:
 ; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7]
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -81,7 +81,7 @@
 ;
 ; AVX512-LABEL: trunc8i64_8i32_ashr:
 ; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT: retq
 entry:
@@ -383,33 +383,47 @@
 }

 define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
-; SSE-LABEL: trunc8i32_8i16_ashr:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: trunc8i32_8i16_ashr:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc8i32_8i16_ashr:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: psrad $16, %xmm1
+; SSSE3-NEXT: psrad $16, %xmm0
+; SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i32_8i16_ashr:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: trunc8i32_8i16_ashr:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: trunc8i32_8i16_ashr:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: trunc8i32_8i16_ashr:
 ; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512F-NEXT: vzeroupper
@@ -417,14 +431,14 @@
 ;
 ; AVX512VL-LABEL: trunc8i32_8i16_ashr:
 ; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: trunc8i32_8i16_ashr:
 ; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512BW-NEXT: vzeroupper
@@ -432,7 +446,7 @@
 ;
 ; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
 ; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
@@ -684,28 +698,52 @@
 }

 define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
-; SSE-LABEL: trunc16i32_16i16_ashr:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: psrad $16, %xmm3
-; SSE-NEXT: psrad $16, %xmm2
-; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: movdqu %xmm2, (%rax)
-; SSE-NEXT: movdqu %xmm0, (%rax)
-; SSE-NEXT: retq
+; SSE2-LABEL: trunc16i32_16i16_ashr:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i16_ashr:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: psrad $16, %xmm1
+; SSSE3-NEXT: psrad $16, %xmm0
+; SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSSE3-NEXT: psrad $16, %xmm3
+; SSSE3-NEXT: psrad $16, %xmm2
+; SSSE3-NEXT: packssdw %xmm3, %xmm2
+; SSSE3-NEXT: movdqu %xmm2, (%rax)
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i16_ashr:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: psrld $16, %xmm3
+; SSE41-NEXT: psrld $16, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: movdqu %xmm2, (%rax)
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: trunc16i32_16i16_ashr:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT: vzeroupper
@@ -713,9 +751,9 @@
 ;
 ; AVX2-LABEL: trunc16i32_16i16_ashr:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1
-; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
 ; AVX2-NEXT: vzeroupper
@@ -890,40 +928,64 @@
 }

 define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
-; SSE-LABEL: trunc16i32_16i8_ashr:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: psrad $24, %xmm1
-; SSE-NEXT: psrad $24, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: psrad $24, %xmm3
-; SSE-NEXT: psrad $24, %xmm2
-; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: packsswb %xmm2, %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rax)
-; SSE-NEXT: retq
+; SSE2-LABEL: trunc16i32_16i8_ashr:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: psrld $24, %xmm1
+; SSE2-NEXT: psrld $24, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrld $24, %xmm3
+; SSE2-NEXT: psrld $24, %xmm2
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i8_ashr:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: psrld $24, %xmm1
+; SSSE3-NEXT: psrld $24, %xmm0
+; SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSSE3-NEXT: psrld $24, %xmm3
+; SSSE3-NEXT: psrld $24, %xmm2
+; SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i8_ashr:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: psrld $24, %xmm1
+; SSE41-NEXT: psrld $24, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: psrld $24, %xmm3
+; SSE41-NEXT: psrld $24, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: trunc16i32_16i8_ashr:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: trunc16i32_16i8_ashr:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1
-; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
+; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX2-NEXT: vzeroupper
@@ -1084,27 +1146,27 @@
 define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
 ; SSE-LABEL: trunc16i16_16i8_ashr:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: psraw $8, %xmm1
-; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
 ; SSE-NEXT: movdqu %xmm0, (%rax)
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc16i16_16i8_ashr:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: trunc16i16_16i8_ashr:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -1127,7 +1189,7 @@
 ;
 ; AVX512BW-LABEL: trunc16i16_16i8_ashr:
 ; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpsraw $8, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX512BW-NEXT: vzeroupper