Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -142,6 +142,14 @@
   /// Keep track of values which map to a pointer base and constant offset.
   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
 
+  /// After SROA finds Stores that can be simplified, the identical loads
+  /// clobbered by these Stores can also be removed.  The simple CSE analysis
+  /// only supports single block callee and quits if there are Stores cannot be
+  /// simplified or function calls in the callee.
+  bool EnableCSELoad;
+  SmallPtrSet<Value *, 12> LoadAddrSet;
+  int CSELoadCost;
+
   // Custom simplification helper routines.
   bool isAllocaDerivedArg(Value *V);
   bool lookupSROAArgAndCost(Value *V, Value *&Arg,
@@ -150,6 +158,7 @@
   void disableSROA(Value *V);
   void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                           int InstructionCost);
+  void disableCSELoad();
   bool isGEPFree(GetElementPtrInst &GEP);
   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
   bool simplifyCallSite(Function *F, CallSite CS);
@@ -232,10 +241,11 @@
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
         HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
         NumVectorInstructions(0), FiftyPercentVectorBonus(0),
-        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
-        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
-        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
-        SROACostSavings(0), SROACostSavingsLost(0) {}
+        TenPercentVectorBonus(0), VectorBonus(0), EnableCSELoad(false),
+        CSELoadCost(0), NumConstantArgs(0), NumConstantOffsetPtrArgs(0),
+        NumAllocaArgs(0), NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
+        NumInstructionsSimplified(0), SROACostSavings(0),
+        SROACostSavingsLost(0), CSELoadCostSavings(0) {}
 
   bool analyzeCall(CallSite CS);
 
@@ -252,6 +262,7 @@
   unsigned NumInstructionsSimplified;
   unsigned SROACostSavings;
   unsigned SROACostSavingsLost;
+  unsigned CSELoadCostSavings;
 
   void dump();
 };
@@ -290,6 +301,7 @@
   SROACostSavings -= CostIt->second;
   SROACostSavingsLost += CostIt->second;
   SROAArgCosts.erase(CostIt);
+  disableCSELoad();
 }
 
 /// \brief If 'V' maps to a SROA candidate, disable SROA for it.
@@ -307,6 +319,12 @@
   SROACostSavings += InstructionCost;
 }
 
+void CallAnalyzer::disableCSELoad() {
+  Cost += CSELoadCost;
+  CSELoadCostSavings = 0;
+  EnableCSELoad = false;
+}
+
 /// \brief Accumulate a constant GEP offset into an APInt if possible.
 ///
 /// Returns false if unable to compute the offset for any reason. Respects any
@@ -821,6 +839,14 @@
     disableSROA(CostIt);
   }
 
+  if(EnableCSELoad) {
+    if (!LoadAddrSet.insert(I.getPointerOperand()).second) {
+      CSELoadCost += InlineConstants::InstrCost;
+      CSELoadCostSavings += InlineConstants::InstrCost;
+      return true;
+    }
+  }
+
   return false;
 }
 
@@ -835,7 +861,8 @@
 
     disableSROA(CostIt);
   }
-
+  if (EnableCSELoad)
+    disableCSELoad();
   return false;
 }
 
@@ -899,6 +926,8 @@
 }
 
 bool CallAnalyzer::visitCallSite(CallSite CS) {
+  if (EnableCSELoad)
+    disableCSELoad();
   if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
       !F.hasFnAttribute(Attribute::ReturnsTwice)) {
     // This aborts the entire analysis.
@@ -1355,6 +1384,9 @@
   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
   NumAllocaArgs = SROAArgValues.size();
 
+  // Only set EnableCSELoad true when there are SROA opportunities.
+  EnableCSELoad = (NumAllocaArgs > 0);
+
   // FIXME: If a caller has multiple calls to a callee, we end up recomputing
   // the ephemeral values multiple times (and they're completely determined by
   // the callee, so this is purely duplicate work).
@@ -1434,6 +1466,8 @@
       // Take off the bonus we applied to the threshold.
       Threshold -= SingleBBBonus;
       SingleBB = false;
+      if (EnableCSELoad)
+        disableCSELoad();
     }
   }
 
@@ -1467,6 +1501,7 @@
   DEBUG_PRINT_STAT(NumInstructions);
   DEBUG_PRINT_STAT(SROACostSavings);
   DEBUG_PRINT_STAT(SROACostSavingsLost);
+  DEBUG_PRINT_STAT(CSELoadCostSavings);
   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
   DEBUG_PRINT_STAT(Cost);
   DEBUG_PRINT_STAT(Threshold);
Index: test/Transforms/Inline/redundant-loads.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/redundant-loads.ll
@@ -0,0 +1,200 @@
+; RUN: opt -inline < %s -S -o - -inline-threshold=225 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.matrix = type { [3 x [3 x double]] }
+
+define void @outer(%struct.matrix* %a, %struct.matrix* %b) {
+; CHECK-LABEL: @outer(
+; CHECK-NOT: call void @matrix_multiply
+  %c = alloca %struct.matrix
+  call void @matrix_multiply(%struct.matrix* %a, %struct.matrix* %b, %struct.matrix* %c)
+  ret void
+}
+
+; Every data in matrix a and b are loaded three times.
+; typedef struct { double m[3][3]; } matrix;
+; void matrix_multiply( matrix *a, matrix *b, matrix *c ){
+;   int i,j,k;
+;     for(i=0; i<3; i++)
+;       for(j=0; j<3; j++)
+;          for(k=0; k<3; k++)
+;            c->m[i][j] += a->m[i][k] * b->m[k][j];
+; }
+
+define void @matrix_multiply(%struct.matrix* %a, %struct.matrix* %b, %struct.matrix* %c) {
+entry:
+  %arrayidx8 = getelementptr inbounds %struct.matrix, %struct.matrix* %a, i64 0, i32 0, i64 0, i64 0
+  %arrayidx18 = getelementptr inbounds %struct.matrix, %struct.matrix* %c, i64 0, i32 0, i64 0, i64 0
+  %0 = load double, double* %arrayidx8
+  %arrayidx13 = getelementptr inbounds %struct.matrix, %struct.matrix* %b, i64 0, i32 0, i64 0, i64 0
+  %1 = load double, double* %arrayidx13
+  %mul = fmul double %0, %1
+  %2 = load double, double* %arrayidx18
+  %add = fadd double %2, %mul
+  store double %add, double* %arrayidx18
+  %arrayidx8.1 = getelementptr inbounds %struct.matrix, %struct.matrix* %a, i64 0, i32 0, i64 0, i64 1
+  %3 = load double, double* %arrayidx8.1
+  %arrayidx13.1 = getelementptr inbounds %struct.matrix, %struct.matrix* %b, i64 0, i32 0, i64 1, i64 0
+  %4 = load double, double* %arrayidx13.1
+  %mul.1 = fmul double %3, %4
+  %add.1 = fadd double %add, %mul.1
+  store double %add.1, double* %arrayidx18
+  %arrayidx8.2 = getelementptr inbounds %struct.matrix, %struct.matrix* %a, i64 0, i32 0, i64 0, i64 2
+  %5 = load double, double* %arrayidx8.2
+  %arrayidx13.2 = getelementptr inbounds %struct.matrix, %struct.matrix* %b, i64 0, i32 0, i64 2, i64 0
+  %6 = load double, double* %arrayidx13.2
+  %mul.2 = fmul double %5, %6
+  %add.2 = fadd double %add.1, %mul.2
+  store double %add.2, double* %arrayidx18
+  %arrayidx18.1 = getelementptr inbounds %struct.matrix, %struct.matrix* %c, i64 0, i32 0, i64 0, i64 1
+  %7 = load double, double* %arrayidx8
+  %arrayidx13.140 = getelementptr inbounds %struct.matrix, %struct.matrix* %b, i64 0, i32 0, i64 0, i64 1
+  %8 = load double, double* %arrayidx13.140
+  %mul.141 = fmul double %7, %8
+  %9 = load double, double* %arrayidx18.1
+  %add.142 = fadd double %9, %mul.141
+  store double %add.142, double* %arrayidx18.1
+  %10 = load double, double* %arrayidx8.1
+  %arrayidx13.1.1 = getelementptr inbounds %struct.matrix, %struct.matrix* %b, i64 0, i32 0, i64 1, i64 1
+  %11 = load double, double* %arrayidx13.1.1
+  %mul.1.1 = fmul double %10, %11
+  %add.1.1 = fadd double %add.142, %mul.1.1
+  store double %add.1.1, double* %arrayidx18.1
+  %12 = load double, double* %arrayidx8.2
+  %arrayidx13.2.1 = getelementptr inbounds %struct.matrix, %struct.matrix* %b, i64 0, i32 0, i64 2, i64 1
+  %13 = load double, double* %arrayidx13.2.1
+  %mul.2.1 = fmul double %12, %13
+  %add.2.1 = fadd double %add.1.1, %mul.2.1
+  store double %add.2.1, double* %arrayidx18.1
+  %arrayidx18.2 = getelementptr inbounds %struct.matrix, %struct.matrix* %c, i64 0, i32 0, i64 0, i64 2
+  %14 = load double, double* %arrayidx8
+  %arrayidx13.243 = getelementptr inbounds %struct.matrix, %struct.matrix* %b, i64 0, i32 0, i64 0, i64 2
+  %15 = load double, double* %arrayidx13.243
+  %mul.244 = fmul double %14, %15
+  %16 = load double, double* %arrayidx18.2
+  %add.245 = fadd double %16, %mul.244
+  store double %add.245, double* %arrayidx18.2
+  %17 = load double, double* %arrayidx8.1
+  %arrayidx13.1.2 = getelementptr inbounds %struct.matrix, %struct.matrix* %b, i64 0, i32 0, i64 1, i64 2
+  %18 = load double, double* %arrayidx13.1.2
+  %mul.1.2 = fmul double %17, %18
+  %add.1.2 = fadd double %add.245, %mul.1.2
+  store double %add.1.2, double* %arrayidx18.2
+  %19 = load double, double* %arrayidx8.2
+  %arrayidx13.2.2 = getelementptr inbounds %struct.matrix, %struct.matrix* %b, i64 0, i32 0, i64 2, i64 2
+  %20 = load double, double* %arrayidx13.2.2
+  %mul.2.2 = fmul double %19, %20
+  %add.2.2 = fadd double %add.1.2, %mul.2.2
+  store double %add.2.2, double* %arrayidx18.2
+  %arrayidx8.146 = getelementptr inbounds %struct.matrix, %struct.matrix* %a, i64 0, i32 0, i64 1, i64 0
+  %arrayidx18.147 = getelementptr inbounds %struct.matrix, %struct.matrix* %c, i64 0, i32 0, i64 1, i64 0
+  %21 = load double, double* %arrayidx8.146
+  %22 = load double, double* %arrayidx13
+  %mul.149 = fmul double %21, %22
+  %23 = load double, double* %arrayidx18.147
+  %add.150 = fadd double %23, %mul.149
+  store double %add.150, double* %arrayidx18.147
+  %arrayidx8.1.151 = getelementptr inbounds %struct.matrix, %struct.matrix* %a, i64 0, i32 0, i64 1, i64 1
+  %24 = load double, double* %arrayidx8.1.151
+  %25 = load double, double* %arrayidx13.1
+  %mul.1.153 = fmul double %24, %25
+  %add.1.154 = fadd double %add.150, %mul.1.153
+  store double %add.1.154, double* %arrayidx18.147
+  %arrayidx8.2.155 = getelementptr inbounds %struct.matrix, %struct.matrix* %a, i64 0, i32 0, i64 1, i64 2
+  %26 = load double, double* %arrayidx8.2.155
+  %27 = load double, double* %arrayidx13.2
+  %mul.2.157 = fmul double %26, %27
+  %add.2.158 = fadd double %add.1.154, %mul.2.157
+  store double %add.2.158, double* %arrayidx18.147
+  %arrayidx18.1.1 = getelementptr inbounds %struct.matrix, %struct.matrix* %c, i64 0, i32 0, i64 1, i64 1
+  %28 = load double, double* %arrayidx8.146
+  %29 = load double, double* %arrayidx13.140
+  %mul.141.1 = fmul double %28, %29
+  %30 = load double, double* %arrayidx18.1.1
+  %add.142.1 = fadd double %30, %mul.141.1
+  store double %add.142.1, double* %arrayidx18.1.1
+  %31 = load double, double* %arrayidx8.1.151
+  %32 = load double, double* %arrayidx13.1.1
+  %mul.1.1.1 = fmul double %31, %32
+  %add.1.1.1 = fadd double %add.142.1, %mul.1.1.1
+  store double %add.1.1.1, double* %arrayidx18.1.1
+  %33 = load double, double* %arrayidx8.2.155
+  %34 = load double, double* %arrayidx13.2.1
+  %mul.2.1.1 = fmul double %33, %34
+  %add.2.1.1 = fadd double %add.1.1.1, %mul.2.1.1
+  store double %add.2.1.1, double* %arrayidx18.1.1
+  %arrayidx18.2.1 = getelementptr inbounds %struct.matrix, %struct.matrix* %c, i64 0, i32 0, i64 1, i64 2
+  %35 = load double, double* %arrayidx8.146
+  %36 = load double, double* %arrayidx13.243
+  %mul.244.1 = fmul double %35, %36
+  %37 = load double, double* %arrayidx18.2.1
+  %add.245.1 = fadd double %37, %mul.244.1
+  store double %add.245.1, double* %arrayidx18.2.1
+  %38 = load double, double* %arrayidx8.1.151
+  %39 = load double, double* %arrayidx13.1.2
+  %mul.1.2.1 = fmul double %38, %39
+  %add.1.2.1 = fadd double %add.245.1, %mul.1.2.1
+  store double %add.1.2.1, double* %arrayidx18.2.1
+  %40 = load double, double* %arrayidx8.2.155
+  %41 = load double, double* %arrayidx13.2.2
+  %mul.2.2.1 = fmul double %40, %41
+  %add.2.2.1 = fadd double %add.1.2.1, %mul.2.2.1
+  store double %add.2.2.1, double* %arrayidx18.2.1
+  %arrayidx8.260 = getelementptr inbounds %struct.matrix, %struct.matrix* %a, i64 0, i32 0, i64 2, i64 0
+  %arrayidx18.261 = getelementptr inbounds %struct.matrix, %struct.matrix* %c, i64 0, i32 0, i64 2, i64 0
+  %42 = load double, double* %arrayidx8.260
+  %43 = load double, double* %arrayidx13
+  %mul.263 = fmul double %42, %43
+  %44 = load double, double* %arrayidx18.261
+  %add.264 = fadd double %44, %mul.263
+  store double %add.264, double* %arrayidx18.261
+  %arrayidx8.1.265 = getelementptr inbounds %struct.matrix, %struct.matrix* %a, i64 0, i32 0, i64 2, i64 1
+  %45 = load double, double* %arrayidx8.1.265
+  %46 = load double, double* %arrayidx13.1
+  %mul.1.267 = fmul double %45, %46
+  %add.1.268 = fadd double %add.264, %mul.1.267
+  store double %add.1.268, double* %arrayidx18.261
+  %arrayidx8.2.269 = getelementptr inbounds %struct.matrix, %struct.matrix* %a, i64 0, i32 0, i64 2, i64 2
+  %47 = load double, double* %arrayidx8.2.269
+  %48 = load double, double* %arrayidx13.2
+  %mul.2.271 = fmul double %47, %48
+  %add.2.272 = fadd double %add.1.268, %mul.2.271
+  store double %add.2.272, double* %arrayidx18.261
+  %arrayidx18.1.2 = getelementptr inbounds %struct.matrix, %struct.matrix* %c, i64 0, i32 0, i64 2, i64 1
+  %49 = load double, double* %arrayidx8.260
+  %50 = load double, double* %arrayidx13.140
+  %mul.141.2 = fmul double %49, %50
+  %51 = load double, double* %arrayidx18.1.2
+  %add.142.2 = fadd double %51, %mul.141.2
+  store double %add.142.2, double* %arrayidx18.1.2
+  %52 = load double, double* %arrayidx8.1.265
+  %53 = load double, double* %arrayidx13.1.1
+  %mul.1.1.2 = fmul double %52, %53
+  %add.1.1.2 = fadd double %add.142.2, %mul.1.1.2
+  store double %add.1.1.2, double* %arrayidx18.1.2
+  %54 = load double, double* %arrayidx8.2.269
+  %55 = load double, double* %arrayidx13.2.1
+  %mul.2.1.2 = fmul double %54, %55
+  %add.2.1.2 = fadd double %add.1.1.2, %mul.2.1.2
+  store double %add.2.1.2, double* %arrayidx18.1.2
+  %arrayidx18.2.2 = getelementptr inbounds %struct.matrix, %struct.matrix* %c, i64 0, i32 0, i64 2, i64 2
+  %56 = load double, double* %arrayidx8.260
+  %57 = load double, double* %arrayidx13.243
+  %mul.244.2 = fmul double %56, %57
+  %58 = load double, double* %arrayidx18.2.2
+  %add.245.2 = fadd double %58, %mul.244.2
+  store double %add.245.2, double* %arrayidx18.2.2
+  %59 = load double, double* %arrayidx8.1.265
+  %60 = load double, double* %arrayidx13.1.2
+  %mul.1.2.2 = fmul double %59, %60
+  %add.1.2.2 = fadd double %add.245.2, %mul.1.2.2
+  store double %add.1.2.2, double* %arrayidx18.2.2
+  %61 = load double, double* %arrayidx8.2.269
+  %62 = load double, double* %arrayidx13.2.2
+  %mul.2.2.2 = fmul double %61, %62
+  %add.2.2.2 = fadd double %add.1.2.2, %mul.2.2.2
+  store double %add.2.2.2, double* %arrayidx18.2.2
+  ret void
+}