Index: lib/CodeGen/MachineScheduler.cpp =================================================================== --- lib/CodeGen/MachineScheduler.cpp +++ lib/CodeGen/MachineScheduler.cpp @@ -1434,17 +1434,22 @@ // Check if either the dest or source is local. If it's live across a back // edge, it's not local. Note that if both vregs are live across the back // edge, we cannot successfully contrain the copy without cyclic scheduling. + // If both the source and the dest are local, make sure the part of the + // "global" is played by a vreg that has at least one potential hole in + // its live interval. unsigned LocalReg = DstReg; unsigned GlobalReg = SrcReg; LiveInterval *LocalLI = &LIS->getInterval(LocalReg); - if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) { + LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg); + if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx) || + GlobalLI->size() == 1) { LocalReg = SrcReg; GlobalReg = DstReg; LocalLI = &LIS->getInterval(LocalReg); if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) return; + GlobalLI = &LIS->getInterval(GlobalReg); } - LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg); // Find the global segment after the start of the local LI. 
LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex()); Index: test/CodeGen/X86/pr21972.ll =================================================================== --- test/CodeGen/X86/pr21972.ll +++ test/CodeGen/X86/pr21972.ll @@ -0,0 +1,41 @@ +; RUN: llc -mtriple=x86_64-linux -mcpu=corei7 < %s | FileCheck %s +; This fixes a missing case in the MI scheduler's constrainLocalCopy exposed by +; PR21972 + +@stuff = external constant [256 x double], align 16 + +define void @func(<4 x float> %vx) { +entry: + %tmp2 = bitcast <4 x float> %vx to <2 x i64> + %and.i = and <2 x i64> %tmp2, + %tmp3 = bitcast <2 x i64> %and.i to <4 x i32> + %index.sroa.0.0.vec.extract = extractelement <4 x i32> %tmp3, i32 0 + %idx.ext = sext i32 %index.sroa.0.0.vec.extract to i64 + %add.ptr = getelementptr inbounds i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext + %tmp4 = bitcast i8* %add.ptr to double* + %index.sroa.0.4.vec.extract = extractelement <4 x i32> %tmp3, i32 1 + %idx.ext5 = sext i32 %index.sroa.0.4.vec.extract to i64 + %add.ptr6 = getelementptr inbounds i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext5 + %tmp5 = bitcast i8* %add.ptr6 to double* + %index.sroa.0.8.vec.extract = extractelement <4 x i32> %tmp3, i32 2 + %idx.ext14 = sext i32 %index.sroa.0.8.vec.extract to i64 + %add.ptr15 = getelementptr inbounds i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext14 + %tmp6 = bitcast i8* %add.ptr15 to double* + %index.sroa.0.12.vec.extract = extractelement <4 x i32> %tmp3, i32 3 + %idx.ext19 = sext i32 %index.sroa.0.12.vec.extract to i64 + %add.ptr20 = getelementptr inbounds i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext19 + %tmp7 = bitcast i8* %add.ptr20 to double* + %add.ptr46 = getelementptr inbounds i8* bitcast (double* getelementptr inbounds ([256 x double]* @stuff, i64 0, i64 1) to i8*), i64 %idx.ext + %tmp16 = bitcast i8* %add.ptr46 to double* + %add.ptr51 = getelementptr inbounds i8* bitcast (double* getelementptr inbounds 
([256 x double]* @stuff, i64 0, i64 1) to i8*), i64 %idx.ext5 + %tmp17 = bitcast i8* %add.ptr51 to double* + call void @toto(double* %tmp4, double* %tmp5, double* %tmp6, double* %tmp7, double* %tmp16, double* %tmp17) + ret void +; CHECK-LABEL: func: +; CHECK: pextrq $1, %xmm0, +; CHECK-NEXT: movd %xmm0, %r[[AX:..]] +; CHECK-NEXT: movslq %e[[AX]], +; CHECK-NEXT: sarq $32, %r[[AX]] +} + +declare void @toto(double*, double*, double*, double*, double*, double*) Index: test/CodeGen/X86/vector-idiv.ll =================================================================== --- test/CodeGen/X86/vector-idiv.ll +++ test/CodeGen/X86/vector-idiv.ll @@ -841,19 +841,18 @@ ; SSE-LABEL: test8: ; SSE: # BB#0: ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrad $31, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: paddd %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pmuludq %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE-NEXT: pmuludq %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: psubd %xmm3, %xmm1 ; SSE-NEXT: paddd %xmm0, %xmm1