Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -30,6 +31,7 @@
 #define DEBUG_TYPE "vector-combine"
 
 STATISTIC(NumVecCmp, "Number of vector compares formed");
+STATISTIC(NumVecBO, "Number of vector binops formed");
 
 static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) {
   // Match a cmp with extracted vector operands.
@@ -76,6 +78,65 @@
   return true;
 }
 
+/// Try to reduce extract element costs by converting scalar binops to vector
+/// binops followed by extract.
+static bool foldExtractBinop(Instruction &I, const TargetTransformInfo &TTI) {
+  // It is not safe to transform things like div, urem, etc. because we may
+  // create undefined behavior when executing those on unknown vector elements.
+  if (!isSafeToSpeculativelyExecute(&I))
+    return false;
+
+  // Match a scalar binop with extracted vector operands:
+  // bo (extelt X, C0), (extelt Y, C1)
+  Instruction *Ext0, *Ext1;
+  if (!match(&I, m_BinOp(m_Instruction(Ext0), m_Instruction(Ext1))))
+    return false;
+
+  // TODO: Relax the one-use constraints by adjusting the cost calc.
+  Value *X, *Y;
+  uint64_t C0, C1;
+  if (!match(Ext0, m_OneUse(m_ExtractElement(m_Value(X), m_ConstantInt(C0)))) ||
+      !match(Ext1, m_OneUse(m_ExtractElement(m_Value(Y), m_ConstantInt(C1)))))
+    return false;
+
+  // Check if using a vector binop would be cheaper.
+  Instruction::BinaryOps BOpcode = cast<BinaryOperator>(I).getOpcode();
+  Type *ScalarTy = I.getType();
+  Type *VecTy = X->getType();
+  int ScalarBOCost = TTI.getArithmeticInstrCost(BOpcode, ScalarTy);
+  int VecBOCost = TTI.getArithmeticInstrCost(BOpcode, VecTy);
+  int Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                            VecTy, C0);
+  int Extract1Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                            VecTy, C1);
+
+  // Handle a special case - if the extract indexes are the same, the
+  // replacement sequence does not require a shuffle. Unless the vector binop is
+  // much more expensive than the scalar binop, this eliminates an extract.
+  if (C0 == C1) {
+    int ScalarCost = Extract0Cost + Extract1Cost + ScalarBOCost;
+    int VecCost = VecBOCost + Extract0Cost;
+    if (ScalarCost <= VecCost)
+      return false;
+
+    // bo (extelt X, C), (extelt Y, C) --> extelt (bo X, Y), C
+    ++NumVecBO;
+    IRBuilder<> Builder(&I);
+    Value *NewBO = Builder.CreateBinOp(BOpcode, X, Y);
+    if (auto *VecBOInst = dyn_cast<Instruction>(NewBO)) {
+      // All IR flags are safe to back-propagate because any potential poison
+      // created in unused vector elements is discarded by the extract.
+      VecBOInst->copyIRFlags(&I);
+    }
+    Value *Extract = Builder.CreateExtractElement(NewBO, Ext0->getOperand(1));
+    I.replaceAllUsesWith(Extract);
+    return true;
+  }
+
+  // TODO: Handle C0 != C1 by shuffling 1 of the operands.
+  return false;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
@@ -92,7 +153,7 @@
     // iteratively in this loop rather than waiting until the end.
     for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
       MadeChange |= foldExtractCmp(I, TTI);
-      // TODO: More transforms go here.
+      MadeChange |= foldExtractBinop(I, TTI);
     }
   }
Index: llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -1,12 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- | FileCheck %s
 
+; Eliminating extract is profitable.
+
 define i8 @ext0_ext0_add(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext0_ext0_add(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = add i8 [[E0]], [[E1]]
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i8> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 0
   %e1 = extractelement <16 x i8> %y, i32 0
@@ -14,12 +15,13 @@
   ret i8 %r
 }
 
+; Eliminating extract is still profitable. Flags propagate.
+
 define i8 @ext1_ext1_add_flags(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext1_ext1_add_flags(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i8 [[E0]], [[E1]]
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw <16 x i8> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 1
   %e1 = extractelement <16 x i8> %y, i32 1
@@ -27,6 +29,8 @@
   ret i8 %r
 }
 
+; Negative test - eliminating extract is profitable, but vector shift is expensive.
+
 define i8 @ext1_ext1_shl(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext1_ext1_shl(
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
@@ -40,6 +44,8 @@
   ret i8 %r
 }
 
+; Negative test - eliminating extract is profitable, but vector multiply is expensive.
+
 define i8 @ext13_ext13_mul(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext13_ext13_mul(
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 13
@@ -53,6 +59,8 @@
   ret i8 %r
 }
 
+; Negative test - cost is irrelevant because sdiv has potential UB.
+
 define i8 @ext0_ext0_sdiv(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext0_ext0_sdiv(
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
@@ -66,6 +74,8 @@
   ret i8 %r
 }
 
+; Negative test - extracts are free and vector op has same cost as scalar.
+
 define double @ext0_ext0_fadd(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @ext0_ext0_fadd(
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
@@ -79,12 +89,13 @@
   ret double %r
 }
 
+; Eliminating extract is profitable. Flags propagate.
+
 define double @ext1_ext1_fsub(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @ext1_ext1_fsub(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fsub fast double [[E0]], [[E1]]
-; CHECK-NEXT:    ret double [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub fast <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    ret double [[TMP2]]
 ;
   %e0 = extractelement <2 x double> %x, i32 1
   %e1 = extractelement <2 x double> %y, i32 1
@@ -92,6 +103,8 @@
   ret double %r
 }
 
+; TODO: Handling different extract indexes requires a shuffle.
+
 define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext0_ext1_add(
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
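Note on the remaining TODO (different extract indexes, as in @ext0_ext1_add above): a minimal IR sketch of what a follow-up fold could produce, assuming the cost model finds a single shuffle of one operand cheap enough on the target. The function name, <4 x i32> type, and shuffle mask below are made up for illustration; this IR is not produced by this patch.

  define i32 @ext0_ext1_add_sketch(<4 x i32> %x, <4 x i32> %y) {
    ; Move element 1 of %y into lane 0 so both operands use the same lane,
    ; then do the binop on vectors and extract the single result lane.
    %s = shufflevector <4 x i32> %y, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %b = add <4 x i32> %x, %s
    %r = extractelement <4 x i32> %b, i32 0
    ret i32 %r
  }

This computes x[0] + y[1] like the scalar form, while still ending in a single extract, so the same cost comparison as the C0 == C1 case would apply plus the shuffle cost.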