Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -1763,6 +1763,7 @@ setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); } // We want to custom lower some of our intrinsics. @@ -19329,9 +19330,11 @@ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); - // If called by the legalizer just return. - if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) { - if ((InVT == MVT::v8i64 || InVT == MVT::v16i32) && VT.is128BitVector()) { + // If we're called by the type legalizer, handle a few cases. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(InVT)) { + if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) && + VT.is128BitVector()) { assert(Subtarget.hasVLX() && "Unexpected subtarget!"); // The default behavior is to truncate one step, concatenate, and then // truncate the remainder. We'd rather produce two 64-bit results and @@ -27958,6 +27961,23 @@ return; } } + if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 && + getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector && + isTypeLegal(MVT::v4i64)) { + // Input needs to be split and output needs to widened. Let's use two + // VTRUNCs, and shuffle their results together into the wider type. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(In, dl); + + Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo); + Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi); + SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi, + { 0, 1, 2, 3, 16, 17, 18, 19, + -1, -1, -1, -1, -1, -1, -1, -1 }); + Results.push_back(Res); + return; + } + return; } case ISD::ANY_EXTEND: Index: llvm/trunk/test/CodeGen/X86/min-legal-vector-width.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/min-legal-vector-width.ll +++ llvm/trunk/test/CodeGen/X86/min-legal-vector-width.ll @@ -797,14 +797,12 @@ ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3 -; CHECK-NEXT: vpmovqd %ymm2, %xmm2 -; CHECK-NEXT: vpmovqd %ymm3, %xmm3 -; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; CHECK-NEXT: vpmovdb %ymm2, %xmm2 -; CHECK-NEXT: vpmovqd %ymm0, %xmm0 -; CHECK-NEXT: vpmovqd %ymm1, %xmm1 -; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-NEXT: vpmovdb %ymm0, %xmm0 +; CHECK-NEXT: vpmovqb %ymm3, %xmm3 +; CHECK-NEXT: vpmovqb %ymm2, %xmm2 +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: vpmovqb %ymm1, %xmm1 +; CHECK-NEXT: vpmovqb %ymm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -829,24 +827,15 @@ } define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" { -; CHECK-AVX512-LABEL: trunc_v8i64_v8i8: -; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-AVX512-NEXT: vpmovqb %ymm1, %xmm1 -; CHECK-AVX512-NEXT: vpmovqb %ymm0, %xmm0 -; CHECK-AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-AVX512-NEXT: vzeroupper -; CHECK-AVX512-NEXT: retq -; -; CHECK-VBMI-LABEL: trunc_v8i64_v8i8: -; CHECK-VBMI: # %bb.0: -; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-VBMI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224] -; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0 -; CHECK-VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-VBMI-NEXT: vzeroupper -; CHECK-VBMI-NEXT: retq +; CHECK-LABEL: trunc_v8i64_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpmovqb %ymm1, %xmm1 +; CHECK-NEXT: vpmovqb %ymm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %a = load <8 x i64>, <8 x i64>* %x %b = trunc <8 x i64> %a to <8 x i8> ret <8 x i8> %b Index: llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll +++ llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll @@ -2732,20 +2732,57 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_packus_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_packus_v16i64_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v16i64_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v16i64_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminsq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v16i64_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %1 = icmp slt <16 x i64> %a0, %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> %3 = icmp sgt <16 x i64> %2, zeroinitializer Index: llvm/trunk/test/CodeGen/X86/vector-trunc-ssat.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-trunc-ssat.ll +++ llvm/trunk/test/CodeGen/X86/vector-trunc-ssat.ll @@ -2717,20 +2717,51 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_ssat_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127] -; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_ssat_v16i64_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v16i64_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsqb %zmm1, %xmm1 +; AVX512VL-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v16i64_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpminsq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v16i64_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsqb %zmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %1 = icmp slt <16 x i64> %a0, %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> %3 = icmp sgt <16 x i64> %2, Index: llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll +++ llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll @@ -1842,17 +1842,45 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_usat_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpminuq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpminuq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_usat_v16i64_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminuq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v16i64_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1 +; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v16i64_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminuq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v16i64_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %1 = icmp ult <16 x i64> %a0, %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> %3 = trunc <16 x i64> %2 to <16 x i8>