Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13318,6 +13318,11 @@
   if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
     return SDValue();
 
+  // Check the operation on the result VT as well (a target may support the
+  // operation for some FP widths but not others).
+  if (!TLI.isOperationLegalOrCustom(Opcode, VT))
+    return SDValue();
+
   // Just because the floating-point vector type is legal does not necessarily
   // mean that the corresponding integer vector type is.
   if (!isTypeLegal(NVT))
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -449,6 +449,16 @@
   setTargetDAGCombine(ISD::SRL);
   setTargetDAGCombine(ISD::ROTL);
 
+  // Scalarize v2f32 early, to avoid later expansion into four operations (see
+  // the comment in PerformDAGCombine).
+  SmallVector<unsigned, 9> FP32Ops =
+      {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV, ISD::FREM, ISD::SINT_TO_FP,
+       ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT};
+  for (auto Op : FP32Ops) {
+    setTargetDAGCombine(Op);
+    setOperationAction(Op, MVT::v2f32, Expand);
+  }
+
   // Handle intrinsics.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -5184,7 +5194,21 @@
 SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   switch(N->getOpcode()) {
-  default: break;
+  default:
+    // Z13 can handle fp32 vectors in registers and memory, but does not
+    // support any vector operations on them. v2f32 is widened to v4f32 and
+    // kept in a single vector register, which is better when only memory
+    // operations are involved. Any operations on v2f32 should be scalarized
+    // before type legalization, or else all four operations will actually be
+    // emitted.
+    if (N->getValueType(0) == MVT::v2f32 ||
+        ((N->getOpcode() == ISD::FP_TO_SINT ||
+          N->getOpcode() == ISD::FP_TO_UINT) &&
+         N->getOperand(0).getValueType() == MVT::v2f32 &&
+         N->getValueType(0) == MVT::v2i32))
+      return DCI.DAG.UnrollVectorOp(N, 2);
+
+    break;
   case ISD::SIGN_EXTEND: return combineSIGN_EXTEND(N, DCI);
   case SystemZISD::MERGE_HIGH:
   case SystemZISD::MERGE_LOW: return combineMERGE(N, DCI);
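Note (not part of the patch): SelectionDAG::UnrollVectorOp scalarizes a vector
node lane by lane, which is what the combine above relies on. The sketch below
is a simplified illustration of the expansion it performs for a two-lane
binary node; scalarizeV2BinOp is a hypothetical helper written only to show
the shape of the result, not code from this patch.

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Illustration only: roughly what DCI.DAG.UnrollVectorOp(N, 2) builds for a
  // two-lane binary node such as (fadd v2f32 %a, %b) -- extract each lane,
  // redo the operation on the scalar type, and reassemble the two results.
  static SDValue scalarizeV2BinOp(SelectionDAG &DAG, SDNode *N) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);            // v2f32
    EVT EltVT = VT.getVectorElementType();  // f32
    EVT IdxVT =
        DAG.getTargetLoweringInfo().getVectorIdxTy(DAG.getDataLayout());
    SmallVector<SDValue, 2> Elts;
    for (unsigned I = 0; I != 2; ++I) {
      SDValue Idx = DAG.getConstant(I, DL, IdxVT);
      SDValue LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                N->getOperand(0), Idx);
      SDValue RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                N->getOperand(1), Idx);
      Elts.push_back(DAG.getNode(N->getOpcode(), DL, EltVT, LHS, RHS));
    }
    // The two scalar operations are what the tests below match.
    return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Elts);
  }

Doing this before type legalization matters: once v2f32 has been widened to
v4f32, the same unrolling would produce four scalar operations instead of two.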
Index: test/CodeGen/SystemZ/fp32-vec-conv.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/fp32-vec-conv.ll
@@ -0,0 +1,40 @@
+; Test that conversions of a vector of two floats generate only two
+; instructions (and not four).
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+define <2 x float> @fun0(<2 x i32> %val1) {
+; CHECK-LABEL: fun0:
+; CHECK: celfbr
+; CHECK: celfbr
+; CHECK-NOT: celfbr
+  %z = uitofp <2 x i32> %val1 to <2 x float>
+  ret <2 x float> %z
+}
+
+define <2 x float> @fun1(<2 x i32> %val1) {
+; CHECK-LABEL: fun1:
+; CHECK: cefbr
+; CHECK: cefbr
+; CHECK-NOT: cefbr
+  %z = sitofp <2 x i32> %val1 to <2 x float>
+  ret <2 x float> %z
+}
+
+define <2 x i32> @fun2(<2 x float> %val1) {
+; CHECK-LABEL: fun2:
+; CHECK: cfebr
+; CHECK: cfebr
+; CHECK-NOT: cfebr
+  %z = fptosi <2 x float> %val1 to <2 x i32>
+  ret <2 x i32> %z
+}
+
+define <2 x i32> @fun3(<2 x float> %val1) {
+; CHECK-LABEL: fun3:
+; CHECK: clfebr
+; CHECK: clfebr
+; CHECK-NOT: clfebr
+  %z = fptoui <2 x float> %val1 to <2 x i32>
+  ret <2 x i32> %z
+}
Index: test/CodeGen/SystemZ/fp32-vec-ops.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/fp32-vec-ops.ll
@@ -0,0 +1,49 @@
+; Test that operations on a vector of two floats generate only two
+; instructions (and not four).
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+define <2 x float> @fun0(<2 x float> %val1, <2 x float> %val2) {
+; CHECK-LABEL: fun0:
+; CHECK: aebr
+; CHECK: aebr
+; CHECK-NOT: aebr
+  %ret = fadd <2 x float> %val1, %val2
+  ret <2 x float> %ret
+}
+
+define <2 x float> @fun1(<2 x float> %val1, <2 x float> %val2) {
+; CHECK-LABEL: fun1:
+; CHECK: sebr
+; CHECK: sebr
+; CHECK-NOT: sebr
+  %ret = fsub <2 x float> %val1, %val2
+  ret <2 x float> %ret
+}
+
+define <2 x float> @fun2(<2 x float> %val1, <2 x float> %val2) {
+; CHECK-LABEL: fun2:
+; CHECK: meebr
+; CHECK: meebr
+; CHECK-NOT: meebr
+  %ret = fmul <2 x float> %val1, %val2
+  ret <2 x float> %ret
+}
+
+define <2 x float> @fun3(<2 x float> %val1, <2 x float> %val2) {
+; CHECK-LABEL: fun3:
+; CHECK: debr
+; CHECK: debr
+; CHECK-NOT: debr
+  %ret = fdiv <2 x float> %val1, %val2
+  ret <2 x float> %ret
+}
+
+define <2 x float> @fun4(<2 x float> %val1, <2 x float> %val2) {
+; CHECK-LABEL: fun4:
+; CHECK: brasl %r14, fmodf@PLT
+; CHECK: brasl %r14, fmodf@PLT
+; CHECK-NOT: brasl %r14, fmodf@PLT
+  %ret = frem <2 x float> %val1, %val2
+  ret <2 x float> %ret
+}
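A note on fun4 above: SystemZ has no f32 remainder instruction, so each
scalarized frem lane is lowered to an fmodf libcall, which is why the test
checks for exactly two brasl ... fmodf@PLT call sites. A rough C++ equivalent
of the expected per-lane expansion (illustration only; frem_v2f32 is not part
of the patch):

  #include <cmath>

  // Each lane of the <2 x float> frem turns into one fmodf libcall.
  void frem_v2f32(const float A[2], const float B[2], float Out[2]) {
    Out[0] = std::fmod(A[0], B[0]); // lane 0 -> first fmodf call
    Out[1] = std::fmod(A[1], B[1]); // lane 1 -> second fmodf call
  }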
Index: test/CodeGen/X86/cvtv2f32.ll
===================================================================
--- test/CodeGen/X86/cvtv2f32.ll
+++ test/CodeGen/X86/cvtv2f32.ll
@@ -8,26 +8,27 @@
 define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) {
 ; X32-LABEL: uitofp_2i32_buildvector:
 ; X32: # BB#0:
-; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; X32-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; X32-NEXT: psrld $16, %xmm1
-; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7]
-; X32-NEXT: addps {{\.LCPI.*}}, %xmm1
-; X32-NEXT: addps %xmm2, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X32-NEXT: orpd %xmm2, %xmm1
+; X32-NEXT: subsd %xmm2, %xmm1
+; X32-NEXT: cvtsd2ss %xmm1, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: orpd %xmm2, %xmm3
+; X32-NEXT: subsd %xmm2, %xmm3
+; X32-NEXT: xorps %xmm2, %xmm2
+; X32-NEXT: cvtsd2ss %xmm3, %xmm2
+; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; X32-NEXT: mulps %xmm1, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: uitofp_2i32_buildvector:
 ; X64: # BB#0:
-; X64-NEXT: movd %edi, %xmm1
-; X64-NEXT: pinsrd $1, %esi, %xmm1
-; X64-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; X64-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; X64-NEXT: psrld $16, %xmm1
-; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7]
-; X64-NEXT: addps {{.*}}(%rip), %xmm1
-; X64-NEXT: addps %xmm2, %xmm1
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: cvtsi2ssq %rax, %xmm1
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: cvtsi2ssq %rax, %xmm2
+; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; X64-NEXT: mulps %xmm1, %xmm0
 ; X64-NEXT: retq
 %t1 = uitofp i32 %x to float
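The updated X64 checks reflect the scalarized lowering of uitofp i32 to float:
each lane is zero-extended to 64 bits and converted with the signed convert
cvtsi2ssq, which is exact because every u32 value is representable as a
non-negative i64. A standalone C++ illustration (U32ToF32 is hypothetical,
not from the patch):

  #include <cstdint>

  // movl %edi, %eax implicitly zero-extends into %rax; cvtsi2ssq then does a
  // signed 64-bit -> float convert, correct for all uint32_t values.
  float U32ToF32(uint32_t X) {
    return static_cast<float>(static_cast<int64_t>(X));
  }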
Index: test_CodeGen_ARM_vdup.diff
===================================================================
--- /dev/null
+++ test_CodeGen_ARM_vdup.diff
@@ -0,0 +1,98 @@
+--- vdup.trunk.s	2017-02-23 14:41:45.000000000 +0200
++++ vdup.patch.s	2017-02-23 14:41:27.000000000 +0200
+@@ -588,76 +588,77 @@
+ 	vmov	r0, r1, d16
+ 	mov	pc, lr
+ .Lfunc_end35:
+ 	.size	check_i8, .Lfunc_end35-check_i8
+ 	.cantunwind
+ 	.fnend
+
+ 	.globl	check_spr_splat2
+ 	.p2align	2
+ 	.type	check_spr_splat2,%function
+ 	.code	32 @ @check_spr_splat2
+ check_spr_splat2:
+ 	.fnstart
+ @ BB#0:
+ 	lsl	r2, r2, #16
+-	vmov	d17, r0, r1
++	vmov	d16, r0, r1
+ 	asr	r2, r2, #16
+-	vdup.32	d16, r2
+-	vcvt.f32.s32	d16, d16
+-	vsub.f32	d16, d16, d17
++	vmov	s0, r2
++	vcvt.f32.s32	s0, s0
++	vdup.32	d17, d0[0]
++	vsub.f32	d16, d17, d16
+ 	vmov	r0, r1, d16
+ 	mov	pc, lr
+ .Lfunc_end36:
+ 	.size	check_spr_splat2, .Lfunc_end36-check_spr_splat2
+ 	.fnend
+
+ 	.globl	check_spr_splat4
+ 	.p2align	2
+ 	.type	check_spr_splat4,%function
+ 	.code	32 @ @check_spr_splat4
+ check_spr_splat4:
+ 	.fnstart
+ @ BB#0:
+-	mov	r12, sp
+-	vmov	d19, r2, r3
+-	vld1.16	{d16[]}, [r12:16]
+-	vmov	d18, r0, r1
+-	vmovl.s16	q8, d16
+-	vcvt.f32.s32	q8, q8
+-	vsub.f32	q8, q8, q9
++	ldrsh	r12, [sp]
++	vmov	d17, r2, r3
++	vmov	d16, r0, r1
++	vmov	s0, r12
++	vcvt.f32.s32	s0, s0
++	vdup.32	q9, d0[0]
++	vsub.f32	q8, q9, q8
+ 	vmov	r0, r1, d16
+ 	vmov	r2, r3, d17
+ 	mov	pc, lr
+ .Lfunc_end37:
+ 	.size	check_spr_splat4, .Lfunc_end37-check_spr_splat4
+ 	.fnend
+
+ 	.globl	check_spr_splat4_lane1
+ 	.p2align	2
+ 	.type	check_spr_splat4_lane1,%function
+ 	.code	32 @ @check_spr_splat4_lane1
+ check_spr_splat4_lane1:
+ 	.fnstart
+ @ BB#0:
+-	mov	r12, sp
+-	vmov	d19, r2, r3
+-	vld1.16	{d16[]}, [r12:16]
+-	vmov	d18, r0, r1
+-	vmovl.s16	q8, d16
+-	vcvt.f32.s32	q8, q8
+-	vsub.f32	q8, q8, q9
++	ldrsh	r12, [sp]
++	vmov	d17, r2, r3
++	vmov	d16, r0, r1
++	vmov	s0, r12
++	vcvt.f32.s32	s0, s0
++	vdup.32	q9, d0[0]
++	vsub.f32	q8, q9, q8
+ 	vmov	r0, r1, d16
+ 	vmov	r2, r3, d17
+ 	mov	pc, lr
+ .Lfunc_end38:
+ 	.size	check_spr_splat4_lane1, .Lfunc_end38-check_spr_splat4_lane1
+ 	.fnend
+
+ 	.globl	check_i8_varidx
+ 	.p2align	2
+ 	.type	check_i8_varidx,%function
+ 	.code	32 @ @check_i8_varidx
+ check_i8_varidx:
+ 	.fnstart
+ @ BB#0:
+ 	.save	{r11}
Index: test_CodeGen_X86_MLICMbug.diff
===================================================================
--- /dev/null
+++ test_CodeGen_X86_MLICMbug.diff
@@ -0,0 +1,73 @@
+--- 2009-02-26-MachineLICMBug.trunk.s	2017-02-23 14:56:54.000000000 +0200
++++ 2009-02-26-MachineLICMBug.patch.s	2017-02-23 14:56:45.000000000 +0200
+@@ -1,59 +1,43 @@
+ 	.section	__TEXT,__text,regular,pure_instructions
+ 	.macosx_version_min 10, 6
+-	.section	__TEXT,__literal16,16byte_literals
+-	.p2align	4
+-LCPI0_0:
+-	.long	1258291200 ## 0x4b000000
+-	.long	1258291200 ## 0x4b000000
+-	.long	1258291200 ## 0x4b000000
+-	.long	1258291200 ## 0x4b000000
+-LCPI0_1:
+-	.long	1392508928 ## 0x53000000
+-	.long	1392508928 ## 0x53000000
+-	.long	1392508928 ## 0x53000000
+-	.long	1392508928 ## 0x53000000
+-LCPI0_2:
+-	.long	3539992704 ## float -5.49764202E+11
+-	.long	3539992704 ## float -5.49764202E+11
+-	.long	3539992704 ## float -5.49764202E+11
+-	.long	3539992704 ## float -5.49764202E+11
+-	.section	__TEXT,__text,regular,pure_instructions
+ 	.globl	_t
+ 	.p2align	4, 0x90
+ _t: ## @t
+ ## BB#0: ## %entry
+ 	pushq	%r14
+ 	pushq	%rbx
+ 	pushq	%rax
+ 	movq	%rsi, %r14
+ 	movq	%rdi, %rbx
+ 	orq	$2097152, %r14 ## imm = 0x200000
+ 	andl	$15728640, %r14d ## imm = 0xF00000
+ 	jmp	LBB0_1
+ 	.p2align	4, 0x90
+ LBB0_3: ## %bb.i
+ ## in Loop: Header=BB0_1 Depth=1
+-	movd	0, %xmm0 ## xmm0 = mem[0],zero,zero,zero
+-	pinsrd	$1, 4, %xmm0
+-	pinsrd	$2, 8, %xmm0
+-	movdqa	%xmm0, %xmm1
+-	pblendw	$170, LCPI0_0(%rip), %xmm1 ## xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7]
+-	psrld	$16, %xmm0
+-	pblendw	$170, LCPI0_1(%rip), %xmm0 ## xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+-	addps	LCPI0_2(%rip), %xmm0
+-	addps	%xmm1, %xmm0
++	movl	0, %eax
++	movl	4, %ecx
++	xorps	%xmm0, %xmm0
++	cvtsi2ssq	%rax, %xmm0
++	xorps	%xmm1, %xmm1
++	cvtsi2ssq	%rcx, %xmm1
++	movl	8, %eax
++	xorps	%xmm2, %xmm2
++	cvtsi2ssq	%rax, %xmm2
++	insertps	$16, %xmm1, %xmm0 ## xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
++	insertps	$32, %xmm2, %xmm0 ## xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+ 	movaps	%xmm0, 0
+ LBB0_1: ## %bb4
+ ## =>This Inner Loop Header: Depth=1
+ 	xorl	%eax, %eax
+ 	callq	_xxGetOffsetForCode
+ 	xorl	%esi, %esi
+ 	xorl	%eax, %eax
+ 	movq	%rbx, %rdi
+ 	callq	_xxCalculateMidType
+ 	cmpl	$1, %eax
+ 	jne	LBB0_1
+ ## BB#2: ## %bb26
+ ## in Loop: Header=BB0_1 Depth=1
+ 	cmpq	$1048576, %r14 ## imm = 0x100000
+ 	jne	LBB0_1