Index: include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
===================================================================
--- include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -193,31 +193,27 @@
   static bool canFoldMergeOpcode(unsigned MergeOp, unsigned ConvertOp,
                                  LLT OpTy, LLT DestTy) {
-    if (OpTy.isVector() && DestTy.isVector())
-      return MergeOp == TargetOpcode::G_CONCAT_VECTORS;
-
-    if (OpTy.isVector() && !DestTy.isVector()) {
-      if (MergeOp == TargetOpcode::G_BUILD_VECTOR)
+    // Check if we found a definition that is like G_MERGE_VALUES.
+    switch (MergeOp) {
+    default:
+      return false;
+    case TargetOpcode::G_BUILD_VECTOR:
+    case TargetOpcode::G_MERGE_VALUES:
+      return true;
+    case TargetOpcode::G_CONCAT_VECTORS: {
+      if (ConvertOp == 0)
         return true;
 
-      if (MergeOp == TargetOpcode::G_CONCAT_VECTORS) {
-        if (ConvertOp == 0)
-          return true;
-
-        const unsigned OpEltSize = OpTy.getElementType().getSizeInBits();
-
-        // Don't handle scalarization with a cast that isn't in the same
-        // direction as the vector cast. This could be handled, but it would
-        // require more intermediate unmerges.
-        if (ConvertOp == TargetOpcode::G_TRUNC)
-          return DestTy.getSizeInBits() <= OpEltSize;
-        return DestTy.getSizeInBits() >= OpEltSize;
-      }
+      const unsigned OpEltSize = OpTy.getElementType().getSizeInBits();
 
-      return false;
+      // Don't handle scalarization with a cast that isn't in the same
+      // direction as the vector cast. This could be handled, but it would
+      // require more intermediate unmerges.
+      if (ConvertOp == TargetOpcode::G_TRUNC)
+        return DestTy.getSizeInBits() <= OpEltSize;
+      return DestTy.getSizeInBits() >= OpEltSize;
+    }
     }
-
-    return MergeOp == TargetOpcode::G_MERGE_VALUES;
   }
 
   bool tryCombineMerges(MachineInstr &MI,
@@ -309,6 +305,10 @@
     } else {
       LLT MergeSrcTy = MRI.getType(MergeI->getOperand(1).getReg());
+
+      if (!ConvertOp && DestTy != MergeSrcTy)
+        ConvertOp = TargetOpcode::G_BITCAST;
+
       if (ConvertOp) {
         Builder.setInstr(MI);
@@ -321,10 +321,10 @@
         markInstAndDefDead(MI, *MergeI, DeadInsts);
         return true;
       }
-      // FIXME: is a COPY appropriate if the types mismatch? We know both
-      // registers are allocatable by now.
-      if (DestTy != MergeSrcTy)
-        return false;
+
+      assert(DestTy == MergeSrcTy &&
+             "Bitcast and the other kinds of conversions should "
+             "have happened earlier");
 
       for (unsigned Idx = 0; Idx < NumDefs; ++Idx)
         MRI.replaceRegWith(MI.getOperand(Idx).getReg(),
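
For review purposes, the decision implemented by the rewritten canFoldMergeOpcode, together with the new G_BITCAST fallback in tryCombineMerges, can be summarized by the stand-alone sketch below. The Opcode enum and SimpleType struct are made-up stand-ins for TargetOpcode constants and llvm::LLT (they are not part of the patch or of the LLVM API); only the control flow is meant to mirror the patched code.

// Stand-alone illustration only; not LLVM code.
enum Opcode : unsigned {
  NoConvert = 0, // plays the role of ConvertOp == 0
  BuildVector,
  MergeValues,
  ConcatVectors,
  Trunc,
  Bitcast
};

struct SimpleType {
  bool IsVector;
  unsigned SizeInBits;
  unsigned EltSizeInBits; // element size for vectors, SizeInBits for scalars
};

static bool sameType(SimpleType A, SimpleType B) {
  return A.IsVector == B.IsVector && A.SizeInBits == B.SizeInBits &&
         A.EltSizeInBits == B.EltSizeInBits;
}

// Mirrors the new canFoldMergeOpcode: can an unmerge whose source is defined
// by MergeOp (possibly through a ConvertOp cast) be folded away?
static bool canFoldMergeOpcodeSketch(Opcode MergeOp, Opcode ConvertOp,
                                     SimpleType OpTy, SimpleType DestTy) {
  switch (MergeOp) {
  default:
    return false;
  case BuildVector:
  case MergeValues:
    return true;
  case ConcatVectors:
    if (ConvertOp == NoConvert)
      return true;
    // Only fold when the cast goes in the same direction as the vector cast;
    // the other direction would need extra intermediate unmerges.
    if (ConvertOp == Trunc)
      return DestTy.SizeInBits <= OpTy.EltSizeInBits;
    return DestTy.SizeInBits >= OpTy.EltSizeInBits;
  }
}

// Mirrors the new behaviour of tryCombineMerges for the same-number-of-pieces
// case: a type mismatch with no explicit cast is now treated as a bitcast
// instead of bailing out, so the final register-replacement path only ever
// sees matching types (hence the assert in the last hunk).
static Opcode pickConversionSketch(Opcode ConvertOp, SimpleType DestTy,
                                   SimpleType MergeSrcTy) {
  // E.g. unmerging <4 x s64> into two s128 pieces: DestTy is s128, MergeSrcTy
  // is <2 x s64>; same size but different kind, so a G_BITCAST is inserted.
  if (ConvertOp == NoConvert && !sameType(DestTy, MergeSrcTy))
    return Bitcast;
  return ConvertOp;
}
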
Index: test/CodeGen/AArch64/GlobalISel/integration-shuffle-vector.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/GlobalISel/integration-shuffle-vector.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -mtriple aarch64-apple-ios -stop-after=instruction-select %s -o - | FileCheck %s
+
+; Check that packing the incoming arguments into a big vector type
+; and unpacking them into registers for the call to @bar gets selected
+; as simple copies. I.e., we don't artificially try to keep the big
+; vector (%vec) alive.
+define void @shuffle_to_concat_vector(<2 x i64> %a, <2 x i64> %b) {
+  ; CHECK-LABEL: name: shuffle_to_concat_vector
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0, $q1
+  ; CHECK:   [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+  ; CHECK:   [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+  ; CHECK:   $q0 = COPY [[COPY]]
+  ; CHECK:   $q1 = COPY [[COPY1]]
+  ; CHECK:   BL @bar, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $q0, implicit $q1
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+  ; CHECK:   RET_ReallyLR
+  %vec = shufflevector <2 x i64> %a, <2 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @bar(<4 x i64> %vec)
+  ret void
+}
+
+declare void @bar(<4 x i64> %vec)
Index: test/CodeGen/AArch64/GlobalISel/legalizer-combiner.mir
===================================================================
--- test/CodeGen/AArch64/GlobalISel/legalizer-combiner.mir
+++ test/CodeGen/AArch64/GlobalISel/legalizer-combiner.mir
@@ -1,15 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_unmerge() {
-  entry:
-    ret void
-  }
-  define void @test_legal_const_ext() { ret void }
-...
+# RUN: llc -O0 -mtriple aarch64-- -run-pass=legalizer %s -o - | FileCheck %s
 
 ---
 name:            test_unmerge
@@ -44,3 +34,60 @@
     %4:_(s32) = G_ANYEXT %3(s1)
     $w0 = COPY %4(s32)
 ...
+
+# Check that the artifact combiner can get rid of the big
+# vector type (4 x s64) by combining the G_UNMERGE_VALUES
+# with the G_CONCAT_VECTORS and turning that into bitcasts.
+---
+name:            concat_vectors_unmerge_to_bitcast
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: concat_vectors_unmerge_to_bitcast
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[COPY]](<2 x s64>)
+    ; CHECK: [[BITCAST1:%[0-9]+]]:_(s128) = G_BITCAST [[COPY1]](<2 x s64>)
+    ; CHECK: $q0 = COPY [[BITCAST]](s128)
+    ; CHECK: $q1 = COPY [[BITCAST1]](s128)
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s64>) = COPY $q1
+    %2:_(<4 x s64>) = G_CONCAT_VECTORS %0(<2 x s64>), %1(<2 x s64>)
+    %3:_(s128), %4:_(s128) = G_UNMERGE_VALUES %2(<4 x s64>)
+    $q0 = COPY %3(s128)
+    $q1 = COPY %4(s128)
+...
+
+# Check that the artifact combiner can get rid of the big
+# vector type (4 x s64) by combining the G_UNMERGE_VALUES
+# with the G_CONCAT_VECTORS and turning that into smaller
+# G_UNMERGE_VALUES of the <2 x s64> sources.
+---
+name:            concat_vectors_unmerge_to_unmerge
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: concat_vectors_unmerge_to_unmerge
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+    ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
+    ; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
+    ; CHECK: $x0 = COPY [[UV]](s64)
+    ; CHECK: $x1 = COPY [[UV1]](s64)
+    ; CHECK: $x2 = COPY [[UV2]](s64)
+    ; CHECK: $x3 = COPY [[UV3]](s64)
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s64>) = COPY $q1
+    %2:_(<4 x s64>) = G_CONCAT_VECTORS %0(<2 x s64>), %1(<2 x s64>)
+    %3:_(s64), %4:_(s64), %5:_(s64), %6:_(s64) = G_UNMERGE_VALUES %2(<4 x s64>)
+    $x0 = COPY %3(s64)
+    $x1 = COPY %4(s64)
+    $x2 = COPY %5(s64)
+    $x3 = COPY %6(s64)
+...
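
As a side note on the two MIR tests above, which of the two combiner outputs you get follows from how many unmerge results each G_CONCAT_VECTORS source has to cover. The helper below is a stand-alone illustration of that arithmetic; the function name and the numbers in main are made up for this note and are not part of the patch.

// Stand-alone illustration only; not LLVM code.
#include <cassert>
#include <cstdio>

// For an unmerge producing NumDefs results of DefSize bits each, fed by a
// concat of NumSrcs sources of SrcSize bits each: how many results does each
// source cover? One result per source means a plain bitcast per source;
// more than one means each source gets its own smaller unmerge.
static unsigned defsPerConcatSource(unsigned NumDefs, unsigned DefSize,
                                    unsigned NumSrcs, unsigned SrcSize) {
  assert(NumDefs * DefSize == NumSrcs * SrcSize && "total sizes must match");
  assert(SrcSize % DefSize == 0 && "each source must split evenly");
  return SrcSize / DefSize;
}

int main() {
  // concat_vectors_unmerge_to_bitcast: two s128 defs from two <2 x s64>
  // sources -> 1 def per source, so each source is just bitcast to s128.
  printf("%u\n", defsPerConcatSource(2, 128, 2, 128));
  // concat_vectors_unmerge_to_unmerge: four s64 defs from two <2 x s64>
  // sources -> 2 defs per source, so each source gets a 2x64-bit unmerge.
  printf("%u\n", defsPerConcatSource(4, 64, 2, 128));
  return 0;
}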