Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -2806,6 +2806,12 @@
     return false;
   }
 
+  /// Return true if it is desirable to speculatively load the operands
+  /// of a select instruction for the target.
+  virtual bool isDesirableToSpeculateSelectLoad() const {
+    return false;
+  }
+
   /// Return true if the target supports swifterror attribute. It optimizes
   /// loads and stores to reading and writing a specific register.
   virtual bool supportSwiftError() const {
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16248,6 +16248,26 @@
                                          LLD->getBasePtr().getValueType()))
       return false;
 
+    // Avoid combining the loads if both are GEPs into elements of the same
+    // struct. TODO: handle cases where the GEP is bitcast to another type.
+    if (TLI.isDesirableToSpeculateSelectLoad()) {
+      if (LLD->getMemOperand()->getValue() &&
+          RLD->getMemOperand()->getValue()) {
+        const GetElementPtrInst *GEPTrue =
+            dyn_cast<GetElementPtrInst>(LLD->getMemOperand()->getValue());
+        const GetElementPtrInst *GEPFalse =
+            dyn_cast<GetElementPtrInst>(RLD->getMemOperand()->getValue());
+        if (GEPTrue && GEPFalse) {
+          if (GEPTrue->getSourceElementType()->isStructTy() &&
+              GEPFalse->getSourceElementType()->isStructTy() &&
+              GEPTrue->getPointerOperand() == GEPFalse->getPointerOperand() &&
+              GEPTrue->hasAllConstantIndices() &&
+              GEPFalse->hasAllConstantIndices())
+            return false;
+        }
+      }
+    }
+
     // Check that the select condition doesn't reach either load. If so,
     // folding this will induce a cycle into the DAG. If not, this is safe to
     // xform, so create a select of the addresses.
Index: lib/Target/X86/CMakeLists.txt
===================================================================
--- lib/Target/X86/CMakeLists.txt
+++ lib/Target/X86/CMakeLists.txt
@@ -45,6 +45,7 @@
   X86RegisterInfo.cpp
   X86SelectionDAGInfo.cpp
   X86ShuffleDecodeConstantPool.cpp
+  X86SpeculateSelectLoad.cpp
   X86Subtarget.cpp
   X86TargetMachine.cpp
   X86TargetObjectFile.cpp
Index: lib/Target/X86/X86.h
===================================================================
--- lib/Target/X86/X86.h
+++ lib/Target/X86/X86.h
@@ -77,6 +77,10 @@
 /// Windows-specific but architecture-neutral preparation.
 FunctionPass *createX86WinEHStatePass();
 
+/// Return an IR pass that tries to speculatively load the operands of
+/// a select instruction when profitable.
+FunctionPass *createX86SpeculateSelectLoadPass();
+
 /// Return a Machine IR pass that expands X86-specific pseudo
 /// instructions into a sequence of actual instructions. This pass
 /// must run after prologue/epilogue insertion and before lowering
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -805,6 +805,10 @@
     /// and some i16 instructions are slow.
     bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
 
+    /// Return true if it is desirable to speculatively load the operands
+    /// of a select instruction for the target.
+    bool isDesirableToSpeculateSelectLoad() const override { return true; }
+
     MachineBasicBlock *
     EmitInstrWithCustomInserter(MachineInstr &MI,
                                 MachineBasicBlock *MBB) const override;
Index: lib/Target/X86/X86SpeculateSelectLoad.cpp
===================================================================
--- /dev/null
+++ lib/Target/X86/X86SpeculateSelectLoad.cpp
@@ -0,0 +1,148 @@
+//===-- X86SpeculateSelectLoad.cpp - Speculatively load operands of a select -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// For a select instruction whose operands are the address calculations of two
+// independent loads, this pass tries to speculate the loads and feed their
+// results into the select instruction. This allows the loads to execute early
+// and in parallel, and possibly to be folded as memory operands into the CMOV
+// instructions later on. The pass currently only handles cases where the loads
+// are elements of the same struct.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86speculateload"
+
+namespace llvm {
+void initializeX86SpeculateSelectLoadPassPass(PassRegistry &);
+}
+
+namespace {
+
+class X86SpeculateSelectLoadPass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid.
+
+  X86SpeculateSelectLoadPass() : FunctionPass(ID) {
+    initializeX86SpeculateSelectLoadPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &Fn) override;
+
+  StringRef getPassName() const override {
+    return "X86 Speculatively load before select instruction";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+private:
+  bool OptimizeSelectInst(SelectInst *SI);
+
+  const DataLayout *DL;
+  const TargetTransformInfo *TTI;
+  // Instructions replaced by the transformation; erased in runOnFunction.
+  SmallVector<Instruction *, 4> InstrForRemoval;
+};
+
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86SpeculateSelectLoadPass() {
+  return new X86SpeculateSelectLoadPass();
+}
+
+char X86SpeculateSelectLoadPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86SpeculateSelectLoadPass, "x86-speculateload",
+                      "X86 Speculatively load before select instruction",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(X86SpeculateSelectLoadPass, "x86-speculateload",
+                    "X86 Speculatively load before select instruction",
+                    false, false)
+
+bool X86SpeculateSelectLoadPass::OptimizeSelectInst(SelectInst *SI) {
+  GetElementPtrInst *GEPIT = dyn_cast<GetElementPtrInst>(SI->getTrueValue());
+  GetElementPtrInst *GEPIF = dyn_cast<GetElementPtrInst>(SI->getFalseValue());
+  if (GEPIT == nullptr || GEPIF == nullptr)
+    return false;
+
+  // The pass currently only handles cases where both loads access scalar
+  // (non-aggregate) elements of the same struct.
+  if (!GEPIT->getSourceElementType()->isStructTy() ||
+      !GEPIF->getSourceElementType()->isStructTy() ||
+      GEPIT->getPointerOperand() != GEPIF->getPointerOperand() ||
+      !GEPIT->hasAllConstantIndices() || !GEPIF->hasAllConstantIndices() ||
+      GEPIT->getNumOperands() != 3 || GEPIF->getNumOperands() != 3)
+    return false;
+
+  if (!SI->hasOneUse())
+    return false;
+
+  // Bail out if there is a good chance we'll be loading from two different
+  // cache lines instead of one.
+  if (StructType *STy = dyn_cast<StructType>(GEPIT->getSourceElementType())) {
+    // Get the indices of the two elements in the struct.
+    ConstantInt *Idx1 = dyn_cast<ConstantInt>(GEPIT->getOperand(2));
+    ConstantInt *Idx2 = dyn_cast<ConstantInt>(GEPIF->getOperand(2));
+    const StructLayout *STL = DL->getStructLayout(STy);
+    if (Idx1 && Idx2 && STL) {
+      uint64_t Offset1 = STL->getElementOffset(Idx1->getZExtValue());
+      uint64_t Offset2 = STL->getElementOffset(Idx2->getZExtValue());
+      uint64_t Dist = Offset1 > Offset2 ? Offset1 - Offset2 : Offset2 - Offset1;
+      assert(TTI->getCacheLineSize() > 0 &&
+             "CacheLineSize information is missing for X86");
+      if (Dist > TTI->getCacheLineSize())
+        return false;
+    }
+  }
+
+  for (User *U : SI->users()) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      if (!LI->isSimple())
+        return false;
+      IRBuilder<> Builder(SI);
+      LoadInst *LT = Builder.CreateAlignedLoad(GEPIT, LI->getAlignment());
+      LoadInst *LF = Builder.CreateAlignedLoad(GEPIF, LI->getAlignment());
+      Value *NewSI = Builder.CreateSelect(SI->getCondition(), LT, LF);
+      LI->replaceAllUsesWith(NewSI);
+      InstrForRemoval.push_back(LI);
+      InstrForRemoval.push_back(SI);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool X86SpeculateSelectLoadPass::runOnFunction(Function &F) {
+  if (skipFunction(F) || F.optForSize())
+    return false;
+
+  DL = &F.getParent()->getDataLayout();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  bool Changed = false;
+
+  for (auto &BB : F)
+    for (auto &I : BB)
+      if (SelectInst *SI = dyn_cast<SelectInst>(&I))
+        Changed |= OptimizeSelectInst(SI);
+
+  for (auto *Instr : InstrForRemoval)
+    Instr->eraseFromParent();
+
+  InstrForRemoval.clear();
+
+  return Changed;
+}
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -59,6 +59,7 @@
 void initializeWinEHStatePassPass(PassRegistry &);
 void initializeFixupLEAPassPass(PassRegistry &);
 void initializeX86ExecutionDepsFixPass(PassRegistry &);
+void initializeX86SpeculateSelectLoadPassPass(PassRegistry &);
 
 } // end namespace llvm
 
@@ -74,6 +75,7 @@
   initializeEvexToVexInstPassPass(PR);
   initializeFixupLEAPassPass(PR);
   initializeX86ExecutionDepsFixPass(PR);
+  initializeX86SpeculateSelectLoadPassPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -342,8 +344,10 @@
   TargetPassConfig::addIRPasses();
 
-  if (TM->getOptLevel() != CodeGenOpt::None)
+  if (TM->getOptLevel() != CodeGenOpt::None) {
     addPass(createInterleavedAccessPass());
+    addPass(createX86SpeculateSelectLoadPass());
+  }
 }
 
 bool X86PassConfig::addInstSelector() {
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -115,6 +115,8 @@
                          const Function *Callee) const;
   bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
   bool enableInterleavedAccessVectorization();
+  unsigned getCacheLineSize() const;
+
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
                       unsigned Alignment, unsigned AddressSpace);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2529,3 +2529,5 @@
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace);
 }
+
+unsigned X86TTIImpl::getCacheLineSize() const { return 64; }
Index: test/CodeGen/X86/speculate-select-load.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/speculate-select-load.ll
@@ -0,0 +1,70 @@
+; RUN: opt < %s -mcpu=x86-64 -S -x86-speculateload | FileCheck %s
+; RUN: opt < %s -mcpu=i386 -S -x86-speculateload | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.S = type { i64, %struct.S*, %struct.S*, i64, i16, i64, i64, i64, i64, i64 }
+
+;; Selecting between pointers to two members of the same struct whose offsets
+;; differ by less than a cache line (64 bytes): the loads are speculated.
+; Function Attrs: norecurse nounwind readonly uwtable
+define %struct.S* @spec_load(i32 %x, %struct.S* nocapture readnone %A, %struct.S* nocapture readonly %B) local_unnamed_addr #0 {
+entry:
+; CHECK-LABEL: @spec_load
+; CHECK: getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 1
+; CHECK: getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 2
+; CHECK: [[A:%[0-9]+]] = load %struct.S*, %struct.S** %{{.*}}, align 8
+; CHECK: [[B:%[0-9]+]] = load %struct.S*, %struct.S** %{{.*}}, align 8
+; CHECK: select i1 %tobool, %struct.S* [[A]], %struct.S* [[B]]
+
+  %tobool = icmp eq i32 %x, 0
+  %b = getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 1
+  %c = getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 2
+  %A.addr.0.in = select i1 %tobool, %struct.S** %c, %struct.S** %b
+  %A.addr.0 = load %struct.S*, %struct.S** %A.addr.0.in, align 8
+  ret %struct.S* %A.addr.0
+}
+
+;; Selecting between pointers to two members of the same struct whose offsets
+;; differ by more than a cache line (64 bytes): do not load speculatively.
+; Function Attrs: norecurse nounwind readonly uwtable
+define i64 @no_spec_load(i32 %x, i64 %A, %struct.S* nocapture readonly %B) local_unnamed_addr #0 {
+entry:
+; CHECK-LABEL: @no_spec_load
+; CHECK: [[A:%[a-z]+]] = getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 0
+; CHECK: [[B:%[a-z]+]] = getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 9
+; CHECK: select i1 %tobool, i64* [[B]], i64* [[A]]
+; CHECK-NEXT: load i64, i64* %{{.*}}, align 8
+
+  %tobool = icmp eq i32 %x, 0
+  %a = getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 0
+  %j = getelementptr inbounds %struct.S, %struct.S* %B, i64 0, i32 9
+  %A.addr.0.in = select i1 %tobool, i64* %j, i64* %a
+  %A.addr.0 = load i64, i64* %A.addr.0.in, align 8
+  ret i64 %A.addr.0
+}
+
+;; Selecting into an aggregate member of the struct; do not load speculatively.
+%struct.S2 = type { i32, [10 x i32] }
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @no_load_agg(%struct.S2* nocapture readonly %s1, %struct.S2* nocapture readnone %s2, i32 %x) local_unnamed_addr #0 {
+entry:
+; CHECK-LABEL: @no_load_agg
+; CHECK: [[A:%[a-z]+]] = getelementptr inbounds %struct.S2, %struct.S2* %s1, i64 0, i32 0
+; CHECK: [[B:%[a-z]+]] = getelementptr inbounds %struct.S2, %struct.S2* %s1, i64 0, i32 1, i64 10
+; CHECK: %retval.0.in = select i1 %tobool, i32* [[B]], i32* [[A]]
+
+  %tobool = icmp eq i32 %x, 0
+  %a = getelementptr inbounds %struct.S2, %struct.S2* %s1, i64 0, i32 0
+  %arrayidx = getelementptr inbounds %struct.S2, %struct.S2* %s1, i64 0, i32 1, i64 10
+  %retval.0.in = select i1 %tobool, i32* %arrayidx, i32* %a
+  %retval.0 = load i32, i32* %retval.0.in, align 4
+  ret i32 %retval.0
+}
+
+attributes #0 = { norecurse nounwind readonly uwtable "target-cpu"="core-avx2" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
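
For reference, the source-level pattern the pass targets typically looks like the sketch below (illustrative only, not part of the patch; Node and pick are made-up names). Mid-level optimizations usually turn such a conditional member access into a select of the two field addresses followed by a single load, as in the tests above; after this pass, both fields are loaded up front and the select chooses between the loaded values, which the backend can then lower to a CMOV with one load folded as a memory operand.

// Hypothetical C++ example: both fields live in the same struct, within one
// 64-byte cache line, so speculatively loading both is cheap.
struct Node {
  long key;
  Node *left;  // offset 8
  Node *right; // offset 16
};

Node *pick(int x, Node *n) {
  // Before the pass: select of &n->right / &n->left, then one load.
  // After the pass: two loads, then a select of the loaded values.
  return x == 0 ? n->right : n->left;
}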