Index: lib/Transforms/InstCombine/InstCombineVectorOps.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -69,8 +69,13 @@
   if (I->getOpcode() == Instruction::InsertElement && isConstant &&
       isa<ConstantInt>(I->getOperand(2)))
     return true;
-  if (I->getOpcode() == Instruction::Load && I->hasOneUse())
+  if (I->hasOneUse()) {
+    if (I->getOpcode() == Instruction::Load ||
+        // FIXME: Support other types?
+        (isConstant && isa<ShuffleVectorInst>(I) &&
+         V->getType()->getScalarSizeInBits() == 16))
     return true;
+  }
   if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
     if (BO->hasOneUse() &&
         (cheapToScalarize(BO->getOperand(0), isConstant) ||
Index: test/Transforms/InstCombine/fast-math-scalarization.ll
===================================================================
--- test/Transforms/InstCombine/fast-math-scalarization.ll
+++ test/Transforms/InstCombine/fast-math-scalarization.ll
@@ -28,7 +28,7 @@
   ret void
 }
 
-; CHECK-LABEL: test_extract_element_fastmath
+; CHECK-LABEL: @test_extract_element_fastmath
 ; CHECK: fadd fast float
 define float @test_extract_element_fastmath(<4 x float> %x) #0 {
 entry:
@@ -37,3 +37,77 @@
   ret float %0
 }
 
+; CHECK-LABEL: @scalarize_shuffles_half2
+; CHECK: extractelement
+; CHECK-NEXT: extractelement
+; CHECK-NEXT: fadd fast half
+; CHECK-NEXT: ret
+define half @scalarize_shuffles_half2(<2 x half> %x, <2 x half> %y) {
+entry:
+  %shuff1 = shufflevector <2 x half> %x, <2 x half> %y, <2 x i32> <i32 2, i32 1>
+  %shuff2 = shufflevector <2 x half> %y, <2 x half> %x, <2 x i32> <i32 2, i32 1>
+  %vadd = fadd fast <2 x half> %shuff1, %shuff2
+  %scalar = extractelement <2 x half> %vadd, i32 0
+  ret half %scalar
+}
+
+; CHECK-LABEL: @scalarize_shuffles_2i16
+; CHECK: extractelement
+; CHECK-NEXT: extractelement
+; CHECK: add i16
+; CHECK-NEXT: ret
+define i16 @scalarize_shuffles_2i16(<2 x i16> %x, <2 x i16> %y) {
+entry:
+  %shuff1 = shufflevector <2 x i16> %x, <2 x i16> %y, <2 x i32> <i32 2, i32 1>
+  %shuff2 = shufflevector <2 x i16> %y, <2 x i16> %x, <2 x i32> <i32 2, i32 1>
+  %vadd = add <2 x i16> %shuff1, %shuff2
+  %scalar = extractelement <2 x i16> %vadd, i32 0
+  ret i16 %scalar
+}
+
+; CHECK-LABEL: @scalarize_multiplebinops
+; CHECK: extractelement
+; CHECK-NEXT: extractelement
+; CHECK-NEXT: extractelement
+; CHECK-NEXT: fadd fast half
+; CHECK-NEXT: fadd fast half
+; CHECK-NEXT: ret
+define half @scalarize_multiplebinops(<2 x half> %x, <2 x half> %y) {
+entry:
+  %x.1 = shufflevector <2 x half> %x, <2 x half> undef, <2 x i32> <i32 1, i32 undef>
+  %shuff1 = shufflevector <2 x half> %x, <2 x half> %y, <2 x i32> <i32 2, i32 1>
+  %shuff2 = shufflevector <2 x half> %y, <2 x half> %x, <2 x i32> <i32 2, i32 1>
+  %vadd1 = fadd fast <2 x half> %shuff1, %shuff2
+  %vadd2 = fadd fast <2 x half> %x.1, %vadd1
+  %scalar = extractelement <2 x half> %vadd2, i32 0
+  ret half %scalar
+}
+
+; CHECK-LABEL: @scalarize_reduction_half2
+; CHECK: extractelement
+; CHECK-NEXT: extractelement
+; CHECK: fadd fast half
+; CHECK: ret
+define half @scalarize_reduction_half2(<2 x half> %x) {
+entry:
+  %rdx.shuf1 = shufflevector <2 x half> %x, <2 x half> undef, <2 x i32> <i32 1, i32 undef>
+  %add = fadd fast <2 x half> %x, %rdx.shuf1
+  %scalar = extractelement <2 x half> %add, i32 0
+  ret half %scalar
+}
+
+; CHECK-LABEL: @scalarize_reduction_float2
+; TODO: The following test should also be scalarized. cheapToScalarize() in InstCombineVectorOps.cpp currently
+; only treats shuffles with 16-bit element types (e.g. half, i16) as cheap, because allowing other types might
+; cause performance regressions on other targets (such as X86). Where such a regression occurs, it is an artifact
+; of pattern matching being too restrictive, and should be fixed.
+; For example, X86 (isHorizontalBinOp) relies on the shuffle pattern to
+; detect horizontal binary operations. Absence of the shuffle causes isHorizontalBinOp to not match the horizontal
+; binop pattern and produces vmovshdup+vaddss instead of vhaddps on AVX2.
+define float @scalarize_reduction_float2(<2 x float> %x) {
+entry:
+  %rdx.shuf1 = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
+  %add = fadd fast <2 x float> %x, %rdx.shuf1
+  %scalar = extractelement <2 x float> %add, i32 0
+  ret float %scalar
+}