Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1777,6 +1777,9 @@
         setOperationAction(ISD::FSHR, VT, Custom);
       }
     }
+
+    setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
   }
 
   // We want to custom lower some of our intrinsics.
@@ -19041,8 +19044,42 @@
          "Invalid TRUNCATE operation");
 
   // If called by the legalizer just return.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) {
+    if ((InVT == MVT::v8i64 || InVT == MVT::v16i32) && VT.is128BitVector()) {
+      assert(Subtarget.hasVLX() && "Unexpected subtarget!");
+      // The default behavior is to truncate one step, concatenate, and then
+      // truncate the remainder. We'd rather produce two 64-bit results and
+      // concatenate those.
+      SDValue Lo, Hi;
+      std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
+
+      if (!ExperimentalVectorWideningLegalization) {
+        // Without vector widening we need to manually construct X86 specific
+        // nodes and an unpcklqdq.
+        Lo = DAG.getNode(X86ISD::VTRUNC, DL, VT, Lo);
+        Hi = DAG.getNode(X86ISD::VTRUNC, DL, VT, Hi);
+
+        // Manually concat the truncates using a shuffle.
+        unsigned NumElts = VT.getVectorNumElements();
+        SmallVector<int, 16> ShufMask(NumElts);
+        for (unsigned i = 0; i != NumElts / 2; ++i)
+          ShufMask[i] = i;
+        for (unsigned i = NumElts / 2; i != NumElts; ++i)
+          ShufMask[i] = i + (NumElts / 2);
+        return DAG.getVectorShuffle(VT, DL, Lo, Hi, ShufMask);
+      }
+
+      EVT LoVT, HiVT;
+      std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+      Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
+      Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+    }
+
+    // Otherwise let default legalization handle it.
     return SDValue();
+  }
 
   if (VT.getVectorElementType() == MVT::i1)
     return LowerTruncateVecI1(Op, DAG, Subtarget);
Index: llvm/test/CodeGen/X86/min-legal-vector-width.ll
===================================================================
--- llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s
 
 ; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
 
@@ -725,10 +726,9 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
 ; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT:    vpmovdw %ymm0, %xmm0
-; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
-; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpmovwb %ymm0, %xmm0
+; CHECK-NEXT:    vpmovdb %ymm1, %xmm1
+; CHECK-NEXT:    vpmovdb %ymm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %a = load <16 x i32>, <16 x i32>* %x
@@ -741,10 +741,9 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
 ; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT:    vpmovqd %ymm0, %xmm0
-; CHECK-NEXT:    vpmovqd %ymm1, %xmm1
-; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpmovdw %ymm0, %xmm0
+; CHECK-NEXT:    vpmovqw %ymm1, %xmm1
+; CHECK-NEXT:    vpmovqw %ymm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %a = load <8 x i64>, <8 x i64>* %x
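For reference, the first test updated above exercises IR of roughly the following shape (a minimal sketch: only the load/trunc body is taken from the hunks, the function name is a guess, and the prefer-256-bit RUN lines are what make <16 x i32> an illegal type, so the type legalizer hands the truncate to LowerTRUNCATE instead of splitting it itself):

; Hypothetical reduced test mirroring the <16 x i32> case above.
define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) {
  %a = load <16 x i32>, <16 x i32>* %x
  %b = trunc <16 x i32> %a to <16 x i8>
  ret <16 x i8> %b
}

With the patch this compiles to two vpmovdb truncates, one per 256-bit half, glued by a single vpunpcklqdq, rather than the old vpmovdw + vinserti128 + vpmovwb sequence; the <8 x i64> case is analogous with vpmovqw.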