Index: a.c
===================================================================
--- /dev/null
+++ a.c
@@ -0,0 +1,46 @@
+#include <x86intrin.h>
+
+typedef float __attribute__((__vector_size__(16))) vec_float4;
+typedef float __attribute__((__vector_size__(8))) vec_float2;
+typedef const vec_float4 vec_float4_arg;
+typedef const vec_float2 vec_float2_arg;
+
+vec_float2 add2f(vec_float2_arg a, vec_float2_arg b) {
+  return a + b; // good - one element-wise add of the two 2-float vectors
+}
+
+vec_float4 add2f_0(vec_float2_arg a0, vec_float2_arg b0, vec_float2_arg a1,
+                   vec_float2_arg b1) {
+  union { // overlays two 2-float halves on one 4-float vector
+    vec_float2 f2[2]; // two elements, so that f2[1] below is in bounds
+    vec_float4 f4;
+  } a, b;
+
+  a.f2[0] = a0; // first 8 bytes of a.f4
+  a.f2[1] = a1; // last 8 bytes of a.f4
+  b.f2[0] = b0;
+  b.f2[1] = b1;
+
+  return a.f4 + b.f4; // bad - to/from gpr
+}
+
+vec_float4 add2f_1(vec_float2_arg a0, vec_float2_arg b0, vec_float2_arg a1,
+                   vec_float2_arg b1) {
+  union { // overlays two 2-float halves on one 4-float vector
+    vec_float2 f2[2]; // two elements, so that f2[1] below is in bounds
+    vec_float4 f4;
+  } c;
+
+  c.f2[0] = a0 + b0; // sum of the first halves
+  c.f2[1] = a1 + b1; // sum of the second halves
+  return c
+      .f4; // bad (ish) - workaround for add2f_0 but requires 2 adds + 1 shuffle
+}
+
+vec_float4 add2f_2(vec_float2_arg a0, vec_float2_arg b0, vec_float2_arg a1,
+                   vec_float2_arg b1) {
+  vec_float4 lhs = __builtin_shufflevector(a0, a1, 0, 1, 2, 3); // a0 then a1
+  vec_float4 rhs = __builtin_shufflevector(b0, b1, 0, 1, 2, 3); // b0 then b1
+  return lhs + rhs; // good - but only because the halves are joined with
+                    // explicit shuffles: 1 add + 2 shuffles
+}
Index: add-packed.ll
===================================================================
--- /dev/null
+++ add-packed.ll
@@ -0,0 +1,72 @@
+; ModuleID = 'a.c'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Function Attrs: nounwind readnone ssp uwtable
+define double @add2f(double %a.coerce, double %b.coerce) #0 { ; <2 x float> add whose operands and result travel as double
+  %1 = bitcast double %a.coerce to <2 x float> ; reinterpret a's 64 bits as two floats
+  %2 = bitcast double %b.coerce to <2 x float> ; same for b
+  %3 = fadd <2 x float> %1, %2 ; lane-wise sum
+  %4 = bitcast <2 x float> %3 to double ; repack the sum for the double return
+  ret double %4
+}
+
+define <4 x float> @add2f_042(<2 x float> %a0.coerce, <2 x float> %a1.coerce, <4 x float> %b1.coerce) #0 { ; NOTE(review): looks like a hand-edited variant of @add2f_0 - confirm intent
+  %1 = bitcast <2 x float> %a0.coerce to <2 x float> ; same-type bitcast
+  %2 = bitcast <2 x float> %a1.coerce to <2 x float> ; same-type bitcast
+  %3 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; a0 in lanes 0-1, lanes 2-3 undef
+  %4 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1> ; a1 in lanes 2-3, lanes 0-1 undef
+  %5 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %4, <4 x float> %3 ; lanes 0-1 from %3, lanes 2-3 from %4
+  %6 = shufflevector <4 x float> %b1.coerce, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1> ; b1 lanes 0-1 moved to lanes 2-3
+  %7 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %6, <4 x float> %3 ; lanes 0-1 from %3 again, lanes 2-3 from %6
+  %8 = fadd <4 x float> %5, %7 ; lane-wise sum of the two selects
+  ret <4 x float> %8
+}
+
+;; Function Attrs: nounwind readnone ssp uwtable
+;define <4 x float> @add2f_0(<2 x float> %a0.coerce, <2 x float> %b0.coerce, <2 x float> %a1.coerce, <2 x float> %b1.coerce) #0 {
+;  %1 = bitcast <2 x float> %a0.coerce to <2 x float>
+;  %2 = bitcast <2 x float> %b0.coerce to <2 x float>
+;  %3 = bitcast <2 x float> %a1.coerce to <2 x float>
+;  %4 = bitcast <2 x float> %b1.coerce to <2 x float>
+;  %5 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+;  %6 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
+;  %7 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %6, <4 x float> %5
+;  %8 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+;  %9 = shufflevector <2 x float> %4, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
+;  %10 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %9, <4 x float> %8
+;  %11 = fadd <4 x float> %7, %10
+;  ret <4 x float> %11
+;}
+;
+;; Function Attrs: nounwind readnone ssp uwtable
+;define <4 x float> @add2f_1(double %a0.coerce, double %b0.coerce, double %a1.coerce, double %b1.coerce) #0 {
+;  %1 = bitcast double %a0.coerce to <2 x float>
+;  %2 = bitcast double %b0.coerce to <2 x float>
+;  %3 = bitcast double %a1.coerce to <2 x float>
+;  %4 = bitcast double %b1.coerce to <2 x float>
+;  %5 = fadd <2 x float> %1, %2
+;  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+;  %7 = fadd <2 x float> %3, %4
+;  %8 = shufflevector <2 x float> %7, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
+;  %9 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %8, <4 x float> %6
+;  ret <4 x float> %9
+;}
+;
+;; Function Attrs: nounwind readnone ssp uwtable
+;define <4 x float> @add2f_2(double %a0.coerce, double %b0.coerce, double %a1.coerce, double %b1.coerce) #0 {
+;  %1 = bitcast double %a0.coerce to <2 x float>
+;  %2 = bitcast double %b0.coerce to <2 x float>
+;  %3 = bitcast double %a1.coerce to <2 x float>
+;  %4 = bitcast double %b1.coerce to <2 x float>
+;  %5 = shufflevector <2 x float> %1, <2 x float> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;  %6 = shufflevector <2 x float> %2, <2 x float> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;  %7 = fadd <4 x float> %5, %6
+;  ret <4 x float> %7
+;}
+;
+attributes #0 = { nounwind readnone ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5 "}
Index: insertps.after.s
===================================================================
--- /dev/null
+++ insertps.after.s
@@ -0,0 +1,49 @@
+	.section	__TEXT,__text,regular,pure_instructions
+	.macosx_version_min 10, 9
+	.globl	_insertWa
+	.align	4, 0x90
+_insertWa:                              ## @insertWa: %xmm0 = a, %rdi = pb; returns a with lane 3 replaced
+	.cfi_startproc
+## BB#0:
+	vinsertps	$48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]; $48 picks source lane 0, so the folded load has offset 0
+	retq
+	.cfi_endproc
+
+	.globl	_insertWb
+	.align	4, 0x90
+_insertWb:                              ## @insertWb: %xmm0 = a, %rdi = pb (float*)
+	.cfi_startproc
+## BB#0:
+	vinsertps	$48, (%rdi), %xmm0, %xmm0 ## lane 3 of xmm0 = float at (%rdi); lanes 0-2 kept
+	retq
+	.cfi_endproc
+
+	.globl	_insertWc
+	.align	4, 0x90
+_insertWc:                              ## @insertWc: %xmm0 = a, %rdi = pb (<4 x float>*)
+	.cfi_startproc
+## BB#0:
+	vinsertps	$48, 12(%rdi), %xmm0, %xmm0 ## NOTE(review): @insertWc in insertps.ll uses mask index 4 (lane 0 of the loaded vector, offset 0); 12(%rdi) is lane 3 - confirm
+	retq
+	.cfi_endproc
+
+	.globl	_insertWd
+	.align	4, 0x90
+_insertWd:                              ## @insertWd: %xmm0 = a, %xmm1 = b; register-only form, no load to fold
+	.cfi_startproc
+## BB#0:
+	vinsertps	$48, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],xmm1[0]
+	retq
+	.cfi_endproc
+
+	.globl	_insertWe
+	.align	4, 0x90
+_insertWe:                              ## @insertWe: %xmm0 = a, %rdi = fb, %rsi = index
+	.cfi_startproc
+## BB#0:
+	vinsertps	$48, (%rdi,%rsi,4), %xmm0, %xmm0 ## lane 3 of xmm0 = fb[index], read directly from memory
+	retq
+	.cfi_endproc
+
+
+.subsections_via_symbols
Index: insertps.before.s
===================================================================
--- /dev/null
+++ insertps.before.s
@@ -0,0 +1,51 @@
+	.section	__TEXT,__text,regular,pure_instructions
+	.macosx_version_min 10, 9
+	.globl	_insertWa
+	.align	4, 0x90
+_insertWa:                              ## @insertWa: %xmm0 = a, %rdi = pb; returns a with lane 3 replaced
+	.cfi_startproc
+## BB#0:
+	vmovaps	(%rdi), %xmm1 ## aligned 16-byte load of the whole vector at pb
+	vinsertps	$48, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],xmm1[0]
+	retq
+	.cfi_endproc
+
+	.globl	_insertWb
+	.align	4, 0x90
+_insertWb:                              ## @insertWb: %xmm0 = a, %rdi = pb (float*)
+	.cfi_startproc
+## BB#0:
+	vinsertps	$48, (%rdi), %xmm0, %xmm0 ## lane 3 of xmm0 = float at (%rdi); lanes 0-2 kept
+	retq
+	.cfi_endproc
+
+	.globl	_insertWc
+	.align	4, 0x90
+_insertWc:                              ## @insertWc: %xmm0 = a, %rdi = pb (<4 x float>*)
+	.cfi_startproc
+## BB#0:
+	vinsertps	$48, 12(%rdi), %xmm0, %xmm0 ## NOTE(review): @insertWc in insertps.ll uses mask index 4 (lane 0 of the loaded vector, offset 0); 12(%rdi) is lane 3 - confirm
+	retq
+	.cfi_endproc
+
+	.globl	_insertWd
+	.align	4, 0x90
+_insertWd:                              ## @insertWd: %xmm0 = a, %xmm1 = b; register-only form
+	.cfi_startproc
+## BB#0:
+	vinsertps	$48, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],xmm1[0]
+	retq
+	.cfi_endproc
+
+	.globl	_insertWe
+	.align	4, 0x90
+_insertWe:                              ## @insertWe: %xmm0 = a, %rdi = fb, %rsi = index
+	.cfi_startproc
+## BB#0:
+	vbroadcastss	(%rdi,%rsi,4), %xmm1 ## splat fb[index] into every lane of xmm1
+	vinsertps	$48, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],xmm1[0]
+	retq
+	.cfi_endproc
+
+
+.subsections_via_symbols
Index: insertps.ll
===================================================================
--- /dev/null
+++ insertps.ll
@@ -0,0 +1,77 @@
+; ModuleID = '/Users/filipe/work/tests/pcg/31474.c'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Function Attrs: nounwind readonly ssp uwtable
+define <4 x float> @insertWa(<4 x float> %a, <4 x float>* nocapture readonly %pb) #0 { ; insertps intrinsic fed by a full vector load
+  %1 = load <4 x float>* %pb, align 16, !tbaa !1 ; 16-byte aligned load of the whole vector
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48) ; imm 48 (0x30): source lane 0 into destination lane 3, nothing zeroed
+  ret <4 x float> %2
+}
+
+; Disabled: exact duplicate of the @insertWe definition later in this file; a module may define a function only once.
+;define <4 x float> @insertWe(<4 x float> %a, float* nocapture readonly %fb, i64 %index) #0 {
+;  %1 = getelementptr inbounds float* %fb, i64 %index
+;  %2 = load float* %1, align 4, !tbaa !4
+;  %3 = insertelement <4 x float> undef, float %2, i32 0
+;  %4 = insertelement <4 x float> %3, float %2, i32 1
+;  %5 = insertelement <4 x float> %4, float %2, i32 2
+;  %6 = insertelement <4 x float> %5, float %2, i32 3
+;  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+;  ret <4 x float> %7
+;}
+
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) #1 ; operands as used in this file: destination vector, source vector, immediate
+
+; Function Attrs: nounwind readonly ssp uwtable
+define <4 x float> @insertWb(<4 x float> %a, float* nocapture readonly %pb) #0 { ; rebuilds %a lane by lane, taking lane 3 from *pb
+  %1 = extractelement <4 x float> %a, i32 0
+  %2 = insertelement <4 x float> undef, float %1, i32 0 ; lane 0 copied from %a
+  %3 = extractelement <4 x float> %a, i32 1
+  %4 = insertelement <4 x float> %2, float %3, i32 1 ; lane 1 copied from %a
+  %5 = extractelement <4 x float> %a, i32 2
+  %6 = insertelement <4 x float> %4, float %5, i32 2 ; lane 2 copied from %a
+  %7 = load float* %pb, align 4, !tbaa !4 ; scalar load
+  %8 = insertelement <4 x float> %6, float %7, i32 3 ; lane 3 = loaded float
+  ret <4 x float> %8
+}
+
+; Function Attrs: nounwind readonly ssp uwtable
+define <4 x float> @insertWc(<4 x float> %a, <4 x float>* nocapture readonly %pb) #0 { ; same insertion expressed as a shuffle with a loaded vector
+  %1 = load <4 x float>* %pb, align 16, !tbaa !1 ; 16-byte aligned load of the whole vector
+  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 4> ; lanes 0-2 from %a, lane 3 = lane 0 of %1 (mask index 4)
+  ret <4 x float> %2
+}
+
+; Function Attrs: nounwind readnone ssp uwtable
+define <4 x float> @insertWd(<4 x float> %a, <4 x float> %b) #2 { ; register-only variant: no memory operand
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4> ; lanes 0-2 from %a, lane 3 = lane 0 of %b
+  ret <4 x float> %1
+}
+
+; Function Attrs: nounwind readonly ssp uwtable
+define <4 x float> @insertWe(<4 x float> %a, float* nocapture readonly %fb, i64 %index) #0 { ; insertps intrinsic fed by a splat of fb[index]
+  %1 = getelementptr inbounds float* %fb, i64 %index ; address of fb[index]
+  %2 = load float* %1, align 4, !tbaa !4 ; scalar load
+  %3 = insertelement <4 x float> undef, float %2, i32 0 ; the next four lines place the same float in every lane
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48) ; imm 48 (0x30): source lane 0 into destination lane 3, nothing zeroed
+  ret <4 x float> %7
+}
+
+attributes #0 = { nounwind readonly ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readnone ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5 "}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0}
+!3 = metadata !{metadata !"Simple C/C++ TBAA"}
+!4 = metadata !{metadata !5, metadata !5, i64 0}
+!5 = metadata !{metadata !"float", metadata !2, i64 0}
Index: lib/Transforms/InstCombine/InstCombineSelect.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -731,6 +731,63 @@
   return V;
 }
 
+// Returns true when the select described by CondVal/TrueVal/FalseVal picks,
+// lane by lane and under a constant vector condition, between two
+// shufflevectors whose second operand is undef and whose first operands have
+// the same type. SI itself is not inspected.
+static bool CanFoldSelectOfShuffleVectors(SelectInst &SI, Value *CondVal,
+                                          Value *TrueVal, Value *FalseVal) {
+  ShuffleVectorInst *TrueSV = dyn_cast<ShuffleVectorInst>(TrueVal);
+  ShuffleVectorInst *FalseSV = dyn_cast<ShuffleVectorInst>(FalseVal);
+  // Every precondition must hold: both shuffle pointers are dereferenced
+  // below, so bail out as soon as any one of them is missing.
+  if (!isa<VectorType>(CondVal->getType()) || !TrueSV || !FalseSV)
+    return false;
+
+  // The per-lane choice has to be a compile-time constant.
+  if (!isa<ConstantVector>(CondVal))
+    return false;
+
+  // We just check for the second operand being undef since instcombine will
+  // turn shufflevector(undef, v) into shufflevector(v, undef)
+  if (!isa<UndefValue>(TrueSV->getOperand(1)) ||
+      !isa<UndefValue>(FalseSV->getOperand(1)))
+    return false;
+
+  // The source vectors (not the mask) for the shuffle vectors have to
+  // have the same type. Otherwise we could end up trying to do a
+  // shufflevector <4 x i32> <2 x i32>
+  return TrueSV->getOperand(0)->getType() ==
+         FalseSV->getOperand(0)->getType();
+}
+
+// Only safe to call if CanFoldSelectOfShuffleVectors is true. Merges both arms
+// into one shufflevector; nullptr if a condition lane is not a ConstantInt.
+static Instruction *FoldSelectOfShuffleVectors(SelectInst &SI) {
+  SmallVector<Constant *, 16> ShuffleMask;
+  ShuffleVectorInst *Sources[] = { cast<ShuffleVectorInst>(SI.getFalseValue()),
+                                   cast<ShuffleVectorInst>(SI.getTrueValue()) };
+  ConstantVector *CondV = cast<ConstantVector>(SI.getCondition());
+  Type *Int32Ty = Type::getInt32Ty(SI.getContext());
+  unsigned NumElems = cast<VectorType>(SI.getType())->getNumElements();
+  unsigned NumSourceElems =
+      cast<VectorType>(Sources[0]->getOperand(0)->getType())->getNumElements();
+  for (unsigned i = 0; i < NumElems; ++i) {
+    ConstantInt *Element = dyn_cast<ConstantInt>(CondV->getAggregateElement(i));
+    if (!Element)
+      return nullptr;
+    unsigned Selector = Element->isOne(); // 1 = true arm, 0 = false arm
+    int SourceIdx = Sources[Selector]->getMaskValue(i);
+    // Undef lanes, and lanes reading the arm's undef operand, stay undef.
+    bool IsUndef = SourceIdx < 0 || unsigned(SourceIdx) >= NumSourceElems;
+    ShuffleMask.push_back(IsUndef ? UndefValue::get(Int32Ty)
+        : ConstantInt::get(Int32Ty, SourceIdx + NumSourceElems * Selector));
+  }
+  return new ShuffleVectorInst(Sources[0]->getOperand(0),
+                               Sources[1]->getOperand(0),
+                               ConstantVector::get(ShuffleMask));
+}
+
 Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
   Value *CondVal = SI.getCondition();
   Value *TrueVal = SI.getTrueValue();
@@ -1005,6 +1062,13 @@
     if (isa<ConstantAggregateZero>(CondVal)) {
       return ReplaceInstUsesWith(SI, FalseVal);
     }
+
+    // If both arms are shufflevectors of (v, undef) and the condition is a
+    // constant vector, every result lane comes from exactly one arm, so we
+    // can merge the shufflevectors and remove the select.
+    if (CanFoldSelectOfShuffleVectors(SI, CondVal, TrueVal, FalseVal))
+      if (Instruction *I = FoldSelectOfShuffleVectors(SI))
+        return I;
   }
 
   return nullptr;
Index: test/Transforms/InstCombine/select.ll
===================================================================
--- test/Transforms/InstCombine/select.ll
+++ test/Transforms/InstCombine/select.ll
@@ -1031,3 +1031,29 @@
 ; CHECK: lshr exact i32 %2, 1
 ; CHECK: xor i32 %3, 42
 }
+
+define <4 x float> @add2f_0(<2 x float> %a0, <2 x float> %b0, <2 x float> %a1, <2 x float> %b1) { ; two shuffle/shuffle/select triples feeding one fadd
+; CHECK-LABEL: @add2f_0
+; CHECK-NOT: select
+; CHECK: ret
+  %1 = shufflevector <2 x float> %a0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; a0 in lanes 0-1, lanes 2-3 undef
+  %2 = shufflevector <2 x float> %a1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1> ; a1 in lanes 2-3, lanes 0-1 undef
+  %3 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %2, <4 x float> %1 ; lanes 0-1 from %1, lanes 2-3 from %2
+  %4 = shufflevector <2 x float> %b0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; b0 in lanes 0-1, lanes 2-3 undef
+  %5 = shufflevector <2 x float> %b1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1> ; b1 in lanes 2-3, lanes 0-1 undef
+  %6 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %5, <4 x float> %4 ; lanes 0-1 from %4, lanes 2-3 from %5
+  %7 = fadd <4 x float> %3, %6 ; lane-wise sum of the two merged vectors
+  ret <4 x float> %7
+}
+
+define <4 x float> @add2f_1(<2 x float> %a0, <2 x float> %b0, <2 x float> %a1, <2 x float> %b1) { ; two 2-lane sums merged by shuffle/shuffle/select
+; CHECK-LABEL: @add2f_1
+; CHECK-NOT: select
+; CHECK: ret
+  %1 = fadd <2 x float> %a0, %b0 ; sum of the first halves
+  %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; %1 in lanes 0-1, lanes 2-3 undef
+  %3 = fadd <2 x float> %a1, %b1 ; sum of the second halves
+  %4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1> ; %3 in lanes 2-3, lanes 0-1 undef
+  %5 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %4, <4 x float> %2 ; lanes 0-1 from %2, lanes 2-3 from %4
+  ret <4 x float> %5
+}