diff --git a/llvm/include/llvm/Analysis/MustExecute.h b/llvm/include/llvm/Analysis/MustExecute.h
--- a/llvm/include/llvm/Analysis/MustExecute.h
+++ b/llvm/include/llvm/Analysis/MustExecute.h
@@ -7,10 +7,17 @@
 //===----------------------------------------------------------------------===//
 /// \file
 /// Contains a collection of routines for determining if a given instruction is
-/// guaranteed to execute if a given point in control flow is reached.  The most
+/// guaranteed to execute if a given point in control flow is reached. The most
 /// common example is an instruction within a loop being provably executed if we
 /// branch to the header of it's containing loop.
 ///
+/// There are two interfaces available to determine if an instruction is
+/// executed once a given point in the control flow is reached:
+/// 1) A loop-centric one derived from LoopSafetyInfo.
+/// 2) A "must be executed context"-based one implemented in the
+///    MustBeExecutedContextExplorer.
+/// Please refer to the class comments for more information.
+///
 //===----------------------------------------------------------------------===//

 #ifndef LLVM_ANALYSIS_MUSTEXECUTE_H
@@ -211,6 +218,243 @@
   virtual ~ICFLoopSafetyInfo() {};
 };

-}
+/// Forward declaration.
+struct MustBeExecutedContextExplorer;
+
+/// Must be executed iterators visit stretches of instructions that are
+/// guaranteed to be executed together, potentially with other instructions
+/// executed in between.
+///
+/// Given the following code, and assuming all statements are single
+/// instructions which transfer execution to the successor (see
+/// isGuaranteedToTransferExecutionToSuccessor), there are two possible
+/// outcomes. If we start the iterator at A, B, or E, we will visit only A, B,
+/// and E. If we start at C or D, we will visit all instructions A-E.
+///
+/// \code
+///   A;
+///   B;
+///   if (...) {
+///     C;
+///     D;
+///   }
+///   E;
+/// \endcode
+///
+/// Below is the example extended with instructions F and G. Now we assume F
+/// might not transfer execution to its successor G. As a result we get the
+/// following visit sets:
+///
+/// Start Instruction | Visit Set
+/// A                 | A, B,       E, F
+/// B                 |    B,       E, F
+/// C                 |       C, D, E, F
+/// D                 |          D, E, F
+/// E                 |             E, F
+/// F                 |                F
+/// G                 |                   G
+///
+/// \code
+///   A;
+///   B;
+///   if (...) {
+///     C;
+///     D;
+///   }
+///   E;
+///   F;  // Might not transfer execution to its successor G.
+///   G;
+/// \endcode
+///
+/// Note that the examples show optimal visit sets, not necessarily the ones
+/// derived by the explorer, which depend on the available CFG analyses (see
+/// MustBeExecutedContextExplorer). Also note that, depending on the options,
+/// the visit set can contain instructions from other functions.
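+///
+/// A minimal usage sketch (illustrative only; \c Explorer, the program point
+/// \c PP, and the \c visit callback are assumed to exist and are not part of
+/// this interface):
+///
+/// \code
+///   for (auto It = Explorer.begin(PP), End = Explorer.end(PP); It != End;
+///        ++It)
+///     visit(*It);
+/// \endcode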
+struct MustBeExecutedIterator {
+  /// Type declarations that make this class an input iterator.
+  ///{
+  typedef const Instruction *value_type;
+  typedef std::ptrdiff_t difference_type;
+  typedef const Instruction **pointer;
+  typedef const Instruction *&reference;
+  typedef std::input_iterator_tag iterator_category;
+  ///}
+
+  using ExplorerTy = MustBeExecutedContextExplorer;
+
+  MustBeExecutedIterator(const MustBeExecutedIterator &Other)
+      : Visited(Other.Visited), Explorer(Other.Explorer),
+        CurInst(Other.CurInst) {}
+
+  MustBeExecutedIterator(MustBeExecutedIterator &&Other)
+      : Visited(std::move(Other.Visited)), Explorer(Other.Explorer),
+        CurInst(Other.CurInst) {}
+
+  MustBeExecutedIterator &operator=(MustBeExecutedIterator &&Other) {
+    if (this != &Other) {
+      std::swap(Visited, Other.Visited);
+      std::swap(CurInst, Other.CurInst);
+    }
+    return *this;
+  }
+
+  ~MustBeExecutedIterator() {}
+
+  /// Pre- and post-increment operators.
+  ///{
+  MustBeExecutedIterator &operator++() {
+    CurInst = advance();
+    return *this;
+  }
+
+  MustBeExecutedIterator operator++(int) {
+    MustBeExecutedIterator tmp(*this);
+    operator++();
+    return tmp;
+  }
+  ///}
+
+  /// Equality and inequality operators. Note that we ignore the history here.
+  ///{
+  bool operator==(const MustBeExecutedIterator &Other) const {
+    return CurInst == Other.CurInst;
+  }
+
+  bool operator!=(const MustBeExecutedIterator &Other) const {
+    return !(*this == Other);
+  }
+  ///}
+
+  /// Return the underlying instruction.
+  const Instruction *&operator*() { return CurInst; }
+  const Instruction *getCurrentInst() const { return CurInst; }
+
+  /// Return true if \p I was encountered by this iterator already.
+  bool count(const Instruction *I) const { return Visited.count(I); }
+
+private:
+  using VisitedSetTy = DenseSet<const Instruction *>;
+
+  /// Private constructor, used by the explorer only.
+  MustBeExecutedIterator(ExplorerTy &Explorer, const Instruction *I);
+
+  /// Reset the iterator to its initial state pointing at \p I.
+  void reset(const Instruction *I);
+
+  /// Try to advance the underlying position (CurInst).
+  ///
+  /// \return The next instruction in the must be executed context, or nullptr
+  ///         if none was found.
+  const Instruction *advance();
+
+  /// A set to track the visited instructions in order to deal with endless
+  /// loops and recursion.
+  VisitedSetTy Visited;
+
+  /// A reference to the explorer that created this iterator.
+  ExplorerTy &Explorer;
+
+  /// The instruction we are currently exposing to the user. There is always an
+  /// instruction that we know is executed with the given program point,
+  /// initially the program point itself.
+  const Instruction *CurInst;
+
+  friend struct MustBeExecutedContextExplorer;
+};
+
+/// A "must be executed context" for a given program point PP is the set of
+/// instructions, potentially before and after PP, that are always executed
+/// when PP is reached. The MustBeExecutedContextExplorer is an interface to
+/// explore "must be executed contexts" in a module through the use of
+/// MustBeExecutedIterator.
+///
+/// The explorer exposes "must be executed iterators" that traverse the must be
+/// executed context. There is little information sharing between iterators as
+/// the expected use case involves few iterators for "far apart" instructions.
+/// If that changes, we should consider caching more intermediate results.
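+///
+/// For illustration, a sketch of the intended use, assuming a function \p F
+/// whose entry instruction is the program point of interest:
+///
+/// \code
+///   MustBeExecutedContextExplorer Explorer(/* ExploreInterBlock */ true);
+///   for (const Instruction *I : Explorer.range(&F.getEntryBlock().front())) {
+///     // I is guaranteed to be executed whenever F is entered.
+///   }
+/// \endcode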
+struct MustBeExecutedContextExplorer {
+
+  /// In the description of the parameters we use PP to denote a program point
+  /// for which the must be executed context is explored, or put differently,
+  /// for which the MustBeExecutedIterator is created.
+  ///
+  /// \param ExploreInterBlock    Flag to indicate if instructions in blocks
+  ///                             other than the parent of PP should be
+  ///                             explored.
+  MustBeExecutedContextExplorer(bool ExploreInterBlock)
+      : ExploreInterBlock(ExploreInterBlock), EndIterator(*this, nullptr) {}
+
+  /// Clean up the dynamically allocated iterators.
+  ~MustBeExecutedContextExplorer() {
+    DeleteContainerSeconds(InstructionIteratorMap);
+  }
+
+  /// Iterator-based interface. \see MustBeExecutedIterator.
+  ///{
+  using iterator = MustBeExecutedIterator;
+  using const_iterator = const MustBeExecutedIterator;
+
+  /// Return an iterator to explore the context around \p PP.
+  iterator &begin(const Instruction *PP) {
+    auto *&It = InstructionIteratorMap[PP];
+    if (!It)
+      It = new iterator(*this, PP);
+    return *It;
+  }
+
+  /// Return an iterator to explore the cached context around \p PP.
+  const_iterator &begin(const Instruction *PP) const {
+    return *InstructionIteratorMap.lookup(PP);
+  }
+
+  /// Return a universal end iterator.
+  ///{
+  iterator &end() { return EndIterator; }
+  iterator &end(const Instruction *) { return EndIterator; }
+
+  const_iterator &end() const { return EndIterator; }
+  const_iterator &end(const Instruction *) const { return EndIterator; }
+  ///}
+
+  /// Return an iterator range to explore the context around \p PP.
+  llvm::iterator_range<iterator> range(const Instruction *PP) {
+    return llvm::make_range(begin(PP), end(PP));
+  }
+
+  /// Return an iterator range to explore the cached context around \p PP.
+  llvm::iterator_range<const_iterator> range(const Instruction *PP) const {
+    return llvm::make_range(begin(PP), end(PP));
+  }
+  ///}
+
+  /// Return the next instruction that is guaranteed to be executed after
+  /// \p PP.
+  ///
+  /// \param It     The iterator that is used to traverse the must be
+  ///               executed context.
+  /// \param PP     The program point for which the next instruction
+  ///               that is guaranteed to execute is determined.
+  const Instruction *
+  getMustBeExecutedNextInstruction(MustBeExecutedIterator &It,
+                                   const Instruction *PP);
+
+  /// Parameters that limit the performed exploration. See the constructor for
+  /// their meaning.
+  ///{
+  const bool ExploreInterBlock;
+  ///}
+
+private:
+  /// Map from instructions to associated must be executed iterators.
+  DenseMap<const Instruction *, MustBeExecutedIterator *>
+      InstructionIteratorMap;
+
+  /// A unique end iterator.
+  MustBeExecutedIterator EndIterator;
+};
+
+} // namespace llvm

 #endif
diff --git a/llvm/lib/Analysis/MustExecute.cpp b/llvm/lib/Analysis/MustExecute.cpp
--- a/llvm/lib/Analysis/MustExecute.cpp
+++ b/llvm/lib/Analysis/MustExecute.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//

 #include "llvm/Analysis/MustExecute.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/Passes.h"
@@ -21,6 +23,8 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;

+#define DEBUG_TYPE "must-execute"
+
 const DenseMap<BasicBlock *, ColorVector> &
 LoopSafetyInfo::getBlockColors() const {
   return BlockColors;
@@ -430,3 +434,75 @@

   return false;
 }
+
+const Instruction *
+MustBeExecutedContextExplorer::getMustBeExecutedNextInstruction(
+    MustBeExecutedIterator &It, const Instruction *PP) {
+  if (!PP)
+    return PP;
+  LLVM_DEBUG(dbgs() << "Find next instruction for " << *PP << "\n");
+
+  // If we explore only inside a given basic block we stop at terminators.
+  if (!ExploreInterBlock && PP->isTerminator()) {
+    LLVM_DEBUG(dbgs() << "\tReached terminator in intra-block mode, done\n");
+    return nullptr;
+  }
+
+  // If we do not traverse the call graph we check if we can make progress in
+  // the current function. First, check if the instruction is guaranteed to
+  // transfer execution to the successor.
+  bool TransfersExecution = isGuaranteedToTransferExecutionToSuccessor(PP);
+  if (!TransfersExecution)
+    return nullptr;
+
+  // If this is not a terminator we know that there is a single instruction
+  // after this one that is executed next if control is transferred. If not,
+  // we can try to go back to a call site we entered earlier. If none exists, we
+  // do not know any instruction that has to be executed next.
+  if (!PP->isTerminator()) {
+    const Instruction *NextPP = PP->getNextNode();
+    LLVM_DEBUG(dbgs() << "\tIntermediate instruction does transfer control\n");
+    return NextPP;
+  }
+
+  // Finally, we have to handle terminators, trivial ones first.
+  assert(PP->isTerminator() && "Expected a terminator!");
+
+  // A terminator without a successor is not handled yet.
+  if (PP->getNumSuccessors() == 0) {
+    LLVM_DEBUG(dbgs() << "\tUnhandled terminator\n");
+    return nullptr;
+  }
+
+  // For a terminator with a single successor, we will continue at the
+  // beginning of that successor.
+  if (PP->getNumSuccessors() == 1) {
+    LLVM_DEBUG(
+        dbgs() << "\tUnconditional terminator, continue with successor\n");
+    return &PP->getSuccessor(0)->front();
+  }
+
+  LLVM_DEBUG(dbgs() << "\tNo join point found\n");
+  return nullptr;
+}
+
+MustBeExecutedIterator::MustBeExecutedIterator(
+    MustBeExecutedContextExplorer &Explorer, const Instruction *I)
+    : Explorer(Explorer), CurInst(I) {
+  reset(I);
+}
+
+void MustBeExecutedIterator::reset(const Instruction *I) {
+  CurInst = I;
+  Visited.clear();
+  Visited.insert(I);
+}
+
+const Instruction *MustBeExecutedIterator::advance() {
+  assert(CurInst && "Cannot advance an end iterator!");
+  const Instruction *Next =
+      Explorer.getMustBeExecutedNextInstruction(*this, CurInst);
+  if (Next && !Visited.insert(Next).second)
+    Next = nullptr;
+  return Next;
+}
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -47,6 +47,7 @@
 #include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Argument.h"
@@ -566,8 +567,9 @@
 /// This method limits promotion of aggregates to only promote up to three
 /// elements of the aggregate in order to avoid exploding the number of
 /// arguments passed in.
-static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy, AAResults &AAR,
-                                    unsigned MaxElements) {
+static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy,
+                                    AAResults &AAR, unsigned MaxElements,
+                                    MustBeExecutedContextExplorer &Explorer) {
   using GEPIndicesSet = std::set<IndicesVector>;

   // Quick exit for unused arguments
@@ -618,21 +620,20 @@
     return true;
   };

-  // First, iterate the entry block and mark loads of (geps of) arguments as
-  // safe.
-  BasicBlock &EntryBlock = Arg->getParent()->front();
   // Declare this here so we can reuse it
   IndicesVector Indices;
-  for (Instruction &I : EntryBlock)
-    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
-      Value *V = LI->getPointerOperand();
-      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
+
+  // First, iterate over all instructions executed whenever the function is
+  // reached and mark loads of (geps of) arguments as safe.
+  BasicBlock &EntryBlock = Arg->getParent()->front();
+  for (const Instruction *I : Explorer.range(&EntryBlock.front()))
+    if (auto *LI = dyn_cast<LoadInst>(I)) {
+      const Value *V = LI->getPointerOperand();
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(V)) {
         V = GEP->getPointerOperand();
         if (V == Arg) {
           // This load actually loads (part of) Arg? Check the indices then.
           Indices.reserve(GEP->getNumIndices());
-          for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
-               II != IE; ++II)
+          for (auto II = GEP->idx_begin(), IE = GEP->idx_end(); II != IE; ++II)
             if (ConstantInt *CI = dyn_cast<ConstantInt>(*II))
               Indices.push_back(CI->getSExtValue());
             else
@@ -683,7 +684,8 @@
         // TODO: This runs the above loop over and over again for dead GEPs
         // Couldn't we just do increment the UI iterator earlier and erase the
         // use?
-        return isSafeToPromoteArgument(Arg, ByValTy, AAR, MaxElements);
+        return isSafeToPromoteArgument(Arg, ByValTy, AAR, MaxElements,
+                                       Explorer);
       }

       if (!UpdateBaseTy(GEP->getSourceElementType()))
@@ -926,6 +928,9 @@

   AAResults &AAR = AARGetter(*F);

+  // TODO: This should be removed once we deduce "dereferenceable" properly.
+  MustBeExecutedContextExplorer Explorer(/* ExploreInterBlock */ true);
+
   // Check to see which arguments are promotable. If an argument is promotable,
   // add it to ArgsToPromote.
   SmallPtrSet<Argument *, 8> ArgsToPromote;
@@ -1001,7 +1006,7 @@
     // Otherwise, see if we can promote the pointer to its value.
     Type *ByValTy =
         PtrArg->hasByValAttr() ? PtrArg->getParamByValType() : nullptr;
-    if (isSafeToPromoteArgument(PtrArg, ByValTy, AAR, MaxElements))
+    if (isSafeToPromoteArgument(PtrArg, ByValTy, AAR, MaxElements, Explorer))
       ArgsToPromote.insert(PtrArg);
   }
diff --git a/llvm/test/Transforms/ArgumentPromotion/executed_in_entry.ll b/llvm/test/Transforms/ArgumentPromotion/executed_in_entry.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/ArgumentPromotion/executed_in_entry.ll
@@ -0,0 +1,87 @@
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -aa-pipeline='basic-aa' -passes=argpromotion -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.S = type { i32, i32 }
+
+; Make sure we do not assume @unknown() is going to return, which would
+; allow us to pre-load %s in @caller. (PR42039)
+define i32 @caller(%struct.S* %s) {
+entry:
+; CHECK: %call = call i32 @local(%struct.S* %s)
+  %call = call i32 @local(%struct.S* %s)
+  ret i32 %call
+}
+
+define internal i32 @local(%struct.S* noalias %s) {
+entry:
+  call void @unknown()
+  %a = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 0
+  %0 = load i32, i32* %a, align 4
+  %b = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 1
+  %1 = load i32, i32* %b, align 4
+  %add = add nsw i32 %0, %1
+  ret i32 %add
+}
+
+declare void @unknown() nounwind
+
+
+; Make sure we do assume @unknown_willreturn() is going to return,
+; which allows us to pre-load %s in @caller_willreturn.
+define i32 @caller_willreturn(%struct.S* %s) {
+entry:
+; CHECK: %call = call i32 @local_willreturn(i32 %{{.*}}, i32 %{{.*}})
+  %call = call i32 @local_willreturn(%struct.S* %s)
+  ret i32 %call
+}
+
+define internal i32 @local_willreturn(%struct.S* noalias %s) {
+entry:
+  call void @unknown_willreturn()
+  %g0 = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 0
+  %l0 = load i32, i32* %g0, align 4
+  %g1 = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 1
+  %l1 = load i32, i32* %g1, align 4
+  %add = add nsw i32 %l0, %l1
+  ret i32 %add
+}
+
+declare void @unknown_willreturn() willreturn nounwind
+
+
+; We should promote %s, as it is for sure executed in
+; @caller_must_be_executed, but not %p, as it is not always accessed
+; (completely).
+define i32 @caller_must_be_executed(%struct.S* %s, %struct.S* %p, i1 %c) {
+entry:
+; TODO: %call = call i32 @local_must_be_executed(i32 %{{.*}}, i32 %{{.*}}, %struct.S* %p, i1 %c)
+  %call = call i32 @local_must_be_executed(%struct.S* %s, %struct.S* %p, i1 %c)
+  ret i32 %call
+}
+
+define internal i32 @local_must_be_executed(%struct.S* %s, %struct.S* %p, i1 %c) {
+entry:
+  %sg0 = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 0
+  %sv0 = load i32, i32* %sg0, align 4
+  br i1 %c, label %then, label %else
+
+then:
+  %pg0 = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 0
+  %pv0 = load i32, i32* %pg0, align 4
+  %add0 = add nsw i32 %sv0, %pv0
+  br label %merge
+
+else:
+  %pg1 = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 1
+  %pv1 = load i32, i32* %pg1, align 4
+  %add1 = add nsw i32 %sv0, %pv1
+  br label %merge
+
+merge:
+  %phi = phi i32 [%add0, %then], [%add1, %else]
+  %sg1 = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 1
+  %sv1 = load i32, i32* %sg1, align 4
+  %add = add nsw i32 %phi, %sv1
+  ret i32 %add
+}