Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1777,6 +1777,9 @@
         setOperationAction(ISD::FSHR, VT, Custom);
       }
     }
+
+    setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
   }
 
   // We want to custom lower some of our intrinsics.
@@ -19041,8 +19044,42 @@
          "Invalid TRUNCATE operation");
 
   // If called by the legalizer just return.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) {
+    if ((InVT == MVT::v8i64 || InVT == MVT::v16i32) && VT.is128BitVector()) {
+      assert(Subtarget.hasVLX() && "Unexpected subtarget!");
+      // The default behavior is to truncate one step, concatenate, and then
+      // truncate the remainder. We'd rather produce two 64-bit results and
+      // concatenate those.
+      SDValue Lo, Hi;
+      std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
+
+      if (!ExperimentalVectorWideningLegalization) {
+        // Without vector widening we need to manually construct X86 specific
+        // nodes and an unpcklqdq.
+        Lo = DAG.getNode(X86ISD::VTRUNC, DL, VT, Lo);
+        Hi = DAG.getNode(X86ISD::VTRUNC, DL, VT, Hi);
+
+        // Manually concat the truncates using a shuffle.
+        unsigned NumElts = VT.getVectorNumElements();
+        SmallVector<int, 16> ShufMask(NumElts);
+        for (unsigned i = 0; i != NumElts / 2; ++i)
+          ShufMask[i] = i;
+        for (unsigned i = NumElts / 2; i != NumElts; ++i)
+          ShufMask[i] = i + (NumElts / 2);
+        return DAG.getVectorShuffle(VT, DL, Lo, Hi, ShufMask);
+      }
+
+      EVT LoVT, HiVT;
+      std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+      Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
+      Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+    }
+
+    // Otherwise let default legalization handle it.
     return SDValue();
+  }
 
   if (VT.getVectorElementType() == MVT::i1)
     return LowerTruncateVecI1(Op, DAG, Subtarget);
Index: llvm/test/CodeGen/X86/min-legal-vector-width.ll
===================================================================
--- llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s
 
 ; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
 
@@ -725,10 +726,9 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
 ; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT:    vpmovdw %ymm0, %xmm0
-; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
-; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpmovwb %ymm0, %xmm0
+; CHECK-NEXT:    vpmovdb %ymm1, %xmm1
+; CHECK-NEXT:    vpmovdb %ymm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %a = load <16 x i32>, <16 x i32>* %x
@@ -741,10 +741,9 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
 ; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT:    vpmovqd %ymm0, %xmm0
-; CHECK-NEXT:    vpmovqd %ymm1, %xmm1
-; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpmovdw %ymm0, %xmm0
+; CHECK-NEXT:    vpmovqw %ymm1, %xmm1
+; CHECK-NEXT:    vpmovqw %ymm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %a = load <8 x i64>, <8 x i64>* %x
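For reference, the first test updated above exercises IR of roughly the following shape (a minimal sketch: only the load/trunc body is taken from the hunks, the function name is a guess, and the prefer-256-bit RUN lines are what make <16 x i32> an illegal type, so the type legalizer hands the truncate to LowerTRUNCATE instead of splitting it itself):

; Hypothetical reduced test mirroring the <16 x i32> case above.
define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) {
  %a = load <16 x i32>, <16 x i32>* %x
  %b = trunc <16 x i32> %a to <16 x i8>
  ret <16 x i8> %b
}

With the patch this compiles to two vpmovdb truncates, one per 256-bit half, glued by a single vpunpcklqdq, rather than the old vpmovdw + vinserti128 + vpmovwb sequence; the <8 x i64> case is analogous with vpmovqw.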