diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1528,6 +1528,7 @@
     setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
     setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
     setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
@@ -20430,7 +20431,8 @@
   if (!TLI.isTypeLegal(InVT)) {
     if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
         VT.is128BitVector()) {
-      assert(Subtarget.hasVLX() && "Unexpected subtarget!");
+      assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
+             "Unexpected subtarget!");
       // The default behavior is to truncate one step, concatenate, and then
       // truncate the remainder. We'd rather produce two 64-bit results and
       // concatenate those.
@@ -36977,6 +36979,17 @@
   unsigned BitWidth = OriginalDemandedBits.getBitWidth();
   unsigned Opc = Op.getOpcode();
   switch(Opc) {
+  case X86ISD::VTRUNC: {
+    KnownBits KnownOp;
+    SDValue Src = Op.getOperand(0);
+
+    // Simplify the input, using demanded bit information.
+    unsigned SrcBitWidth = Src.getScalarValueSizeInBits();
+    APInt TruncMask = OriginalDemandedBits.zext(SrcBitWidth);
+    if (SimplifyDemandedBits(Src, TruncMask, KnownOp, TLO, Depth + 1))
+      return true;
+    break;
+  }
   case X86ISD::PMULDQ:
   case X86ISD::PMULUDQ: {
     // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
@@ -43781,7 +43794,8 @@
   return combineVectorTruncation(N, DAG, Subtarget);
 }
 
-static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI) {
   EVT VT = N->getValueType(0);
   SDValue In = N->getOperand(0);
   SDLoc DL(N);
@@ -43791,6 +43805,11 @@
   if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
     return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
 
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
+    return SDValue(N, 0);
+
   return SDValue();
 }
 
@@ -47520,7 +47539,7 @@
   case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
   case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
   case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
-  case X86ISD::VTRUNC: return combineVTRUNC(N, DAG);
+  case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
   case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
   case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
   case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -300,12 +300,11 @@
 ;
 ; AVX512-LABEL: trunc_add_v16i64_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
   %1 = add <16 x i64> %a0, %a1
@@ -731,10 +730,9 @@
 ;
 ; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -1146,12 +1144,11 @@
 ;
 ; AVX512-LABEL: trunc_sub_v16i64_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
 ; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
   %1 = sub <16 x i64> %a0, %a1
@@ -1545,10 +1542,9 @@
 ;
 ; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -2079,38 +2075,31 @@
 ;
 ; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovqd %zmm3, %ymm3
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpmovqd %zmm2, %ymm2
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1
-; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1
 ; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
   %1 = mul <16 x i64> %a0, %a1
@@ -2587,16 +2576,35 @@
 ; AVX2-FAST-NEXT: vzeroupper
 ; AVX2-FAST-NEXT: retq
 ;
-; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmuludq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpmuludq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmuludq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmuludq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
   %1 = mul <16 x i64> %a0, 
   %2 = trunc <16 x i64> %1 to <16 x i8>
   ret <16 x i8> %2
@@ -3024,12 +3032,11 @@
 ;
 ; AVX512-LABEL: trunc_and_v16i64_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
 ; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
   %1 = and <16 x i64> %a0, %a1
@@ -3396,10 +3403,9 @@
 ;
 ; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -3789,12 +3795,11 @@
 ;
 ; AVX512-LABEL: trunc_xor_v16i64_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
 ; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
   %1 = xor <16 x i64> %a0, %a1
@@ -4161,10 +4166,9 @@
 ;
 ; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -4554,12 +4558,11 @@
 ;
 ; AVX512-LABEL: trunc_or_v16i64_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
 ; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
   %1 = or <16 x i64> %a0, %a1
@@ -4926,10 +4929,9 @@
 ;
 ; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -5033,57 +5033,16 @@
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
-; AVX512F-LABEL: trunc_packus_v16i64_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpminsq (%rdi), %zmm0, %zmm1
-; AVX512F-NEXT: vpminsq 64(%rdi), %zmm0, %zmm0
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_packus_v16i64_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmaxsq 64(%rdi), %zmm0, %zmm1
-; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1
-; AVX512VL-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_packus_v16i64_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpminsq (%rdi), %zmm0, %zmm1
-; AVX512BW-NEXT: vpminsq 64(%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_packus_v16i64_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmaxsq 64(%rdi), %zmm0, %zmm1
-; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1
-; AVX512BWVL-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: trunc_packus_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsq 64(%rdi), %zmm0, %zmm1
+; AVX512-NEXT: vpmovusqb %zmm1, %xmm1
+; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: vpmovusqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
 ;
 ; SKX-LABEL: trunc_packus_v16i64_v16i8:
 ; SKX: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -4835,55 +4835,15 @@
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
-; AVX512F-LABEL: trunc_ssat_v16i64_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpminsq (%rdi), %zmm0, %zmm1
-; AVX512F-NEXT: vpminsq 64(%rdi), %zmm0, %zmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_ssat_v16i64_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512VL-NEXT: vmovdqa64 64(%rdi), %zmm1
-; AVX512VL-NEXT: vpmovsqb %zmm1, %xmm1
-; AVX512VL-NEXT: vpmovsqb %zmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_ssat_v16i64_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm0 = [127,127,127,127,127,127,127,127]
-; AVX512BW-NEXT: vpminsq (%rdi), %zmm0, %zmm1
-; AVX512BW-NEXT: vpminsq 64(%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_ssat_v16i64_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vmovdqa64 64(%rdi), %zmm1
-; AVX512BWVL-NEXT: vpmovsqb %zmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovsqb %zmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: trunc_ssat_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512-NEXT: vpmovsqb %zmm1, %xmm1
+; AVX512-NEXT: vpmovsqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
 ;
 ; SKX-LABEL: trunc_ssat_v16i64_v16i8:
 ; SKX: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -3543,49 +3543,15 @@
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
-; AVX512F-LABEL: trunc_usat_v16i64_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpminuq 64(%rdi), %zmm0, %zmm1
-; AVX512F-NEXT: vpminuq (%rdi), %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_usat_v16i64_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512VL-NEXT: vmovdqa64 64(%rdi), %zmm1
-; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1
-; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_usat_v16i64_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpminuq 64(%rdi), %zmm0, %zmm1
-; AVX512BW-NEXT: vpminuq (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_usat_v16i64_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vmovdqa64 64(%rdi), %zmm1
-; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: trunc_usat_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512-NEXT: vpmovusqb %zmm1, %xmm1
+; AVX512-NEXT: vpmovusqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
 ;
 ; SKX-LABEL: trunc_usat_v16i64_v16i8:
 ; SKX: # %bb.0: