Index: llvm/lib/Transforms/IPO/ArgumentPromotion.cpp =================================================================== --- llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -91,43 +91,75 @@ #define DEBUG_TYPE "argpromotion" STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted"); -STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted"); STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted"); STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated"); -/// A vector used to hold the indices of a single GEP instruction -using IndicesVector = std::vector; +struct ArgPart { + Type *Ty; + Align Alignment; + /// A representative guaranteed-executed load instruction for use by + /// metadata transfer. + LoadInst *MustExecLoad; +}; +using OffsetAndArgPart = std::pair; + +static Value *createByteGEP(IRBuilderBase &IRB, const DataLayout &DL, + Value *Ptr, Type *ResElemTy, int64_t Offset) { + // For non-opaque pointers, try create a "nice" GEP if possible, otherwise + // fall back to an i8 GEP to a specific offset. + unsigned AddrSpace = Ptr->getType()->getPointerAddressSpace(); + APInt OrigOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset); + if (!Ptr->getType()->isOpaquePointerTy()) { + Type *OrigElemTy = Ptr->getType()->getNonOpaquePointerElementType(); + if (OrigOffset == 0 && OrigElemTy == ResElemTy) + return Ptr; + + APInt TmpOffset = OrigOffset; + Type *TmpTy = OrigElemTy; + SmallVector IntIndices = + DL.getGEPIndicesForOffset(TmpTy, TmpOffset); + if (TmpOffset == 0) { + // Try to add trailing zero indices to reach the right type. + while (TmpTy != ResElemTy) { + Type *NextTy = GetElementPtrInst::getTypeAtIndex(TmpTy, (uint64_t)0); + if (!NextTy) + break; + + IntIndices.push_back(APInt::getZero( + isa(TmpTy) ? 32 : OrigOffset.getBitWidth())); + TmpTy = NextTy; + } + + SmallVector Indices; + for (const APInt &Index : IntIndices) + Indices.push_back(IRB.getInt(Index)); + + Ptr = IRB.CreateGEP(OrigElemTy, Ptr, Indices); + return IRB.CreateBitCast(Ptr, ResElemTy->getPointerTo(AddrSpace)); + } + } + + if (OrigOffset != 0) { + Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(AddrSpace)); + Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(OrigOffset)); + } + return IRB.CreateBitCast(Ptr, ResElemTy->getPointerTo(AddrSpace)); +} /// DoPromotion - This method actually performs the promotion of the specified /// arguments, and returns the new function. At this point, we know that it's /// safe to do so. -static Function * -doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, - SmallPtrSetImpl &ByValArgsToTransform, - Optional> - ReplaceCallSite) { +static Function *doPromotion( + Function *F, + const DenseMap> &ArgsToPromote, + SmallPtrSetImpl &ByValArgsToTransform, + Optional> + ReplaceCallSite) { // Start by computing a new prototype for the function, which is the same as // the old function, but has modified arguments. FunctionType *FTy = F->getFunctionType(); std::vector Params; - using ScalarizeTable = std::set>; - - // ScalarizedElements - If we are promoting a pointer that has elements - // accessed out of it, keep track of which elements are accessed so that we - // can add one argument for each. - // - // Arguments that are directly loaded will have a zero element value here, to - // handle cases where there are both a direct load and GEP accesses. 
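// Aside: createByteGEP above reduces every promoted access to "base pointer
// plus byte offset". A minimal sketch of just its i8-GEP fallback path (the
// only path needed once opaque pointers are in use), assuming an in-scope
// IRBuilderBase and the usual llvm/IR/IRBuilder.h include; the helper name
// is hypothetical:
static Value *emitByteOffsetAddr(IRBuilderBase &IRB, Value *Ptr,
                                 Type *ResElemTy, int64_t Offset) {
  unsigned AS = Ptr->getType()->getPointerAddressSpace();
  if (Offset != 0) {
    // Address the byte at Ptr + Offset, then cast to the loaded type.
    Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(AS));
    Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt64(Offset));
  }
  return IRB.CreateBitCast(Ptr, ResElemTy->getPointerTo(AS));
}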
- std::map ScalarizedElements; - - // OriginalLoads - Keep track of a representative load instruction from the - // original function so that we can tell the alias analysis implementation - // what the new GEP/Load instructions we are inserting look like. - // We need to keep the original loads for each argument and the elements - // of the argument that are accessed. - std::map, LoadInst *> OriginalLoads; - // Attribute - Keep track of the parameter attributes for the arguments // that we are *not* promoting. For the ones that we do promote, the parameter // attributes are lost @@ -154,58 +186,12 @@ // Dead argument (which are always marked as promotable) ++NumArgumentsDead; } else { - // Okay, this is being promoted. This means that the only uses are loads - // or GEPs which are only used by loads - - // In this table, we will track which indices are loaded from the argument - // (where direct loads are tracked as no indices). - ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; - for (User *U : make_early_inc_range(I->users())) { - Instruction *UI = cast(U); - Type *SrcTy; - if (LoadInst *L = dyn_cast(UI)) - SrcTy = L->getType(); - else - SrcTy = cast(UI)->getSourceElementType(); - // Skip dead GEPs and remove them. - if (isa(UI) && UI->use_empty()) { - UI->eraseFromParent(); - continue; - } - - IndicesVector Indices; - Indices.reserve(UI->getNumOperands() - 1); - // Since loads will only have a single operand, and GEPs only a single - // non-index operand, this will record direct loads without any indices, - // and gep+loads with the GEP indices. - for (const Use &I : llvm::drop_begin(UI->operands())) - Indices.push_back(cast(I)->getSExtValue()); - // GEPs with a single 0 index can be merged with direct loads - if (Indices.size() == 1 && Indices.front() == 0) - Indices.clear(); - ArgIndices.insert(std::make_pair(SrcTy, Indices)); - LoadInst *OrigLoad; - if (LoadInst *L = dyn_cast(UI)) - OrigLoad = L; - else - // Take any load, we will use it only to update Alias Analysis - OrigLoad = cast(UI->user_back()); - OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad; - } - - // Add a parameter to the function for each element passed in. - for (const auto &ArgIndex : ArgIndices) { - // not allowed to dereference ->begin() if size() is 0 - Params.push_back(GetElementPtrInst::getIndexedType( - I->getType()->getPointerElementType(), ArgIndex.second)); + const auto &ArgParts = ArgsToPromote.find(&*I)->second; + for (const auto &Pair : ArgParts) { + Params.push_back(Pair.second.Ty); ArgAttrVec.push_back(AttributeSet()); - assert(Params.back()); } - - if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty()) - ++NumArgumentsPromoted; - else - ++NumAggregatesPromoted; + ++NumArgumentsPromoted; } } @@ -277,44 +263,20 @@ ArgAttrVec.push_back(AttributeSet()); } } else if (!I->use_empty()) { - // Non-dead argument: insert GEPs and loads as appropriate. - ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; - // Store the Value* version of the indices in here, but declare it now - // for reuse. - std::vector Ops; - for (const auto &ArgIndex : ArgIndices) { - Value *V = *AI; - LoadInst *OrigLoad = - OriginalLoads[std::make_pair(&*I, ArgIndex.second)]; - if (!ArgIndex.second.empty()) { - Ops.reserve(ArgIndex.second.size()); - Type *ElTy = V->getType(); - for (auto II : ArgIndex.second) { - // Use i32 to index structs, and i64 for others (pointers/arrays). - // This satisfies GEP constraints. - Type *IdxTy = - (ElTy->isStructTy() ? 
Type::getInt32Ty(F->getContext()) - : Type::getInt64Ty(F->getContext())); - Ops.push_back(ConstantInt::get(IdxTy, II)); - // Keep track of the type we're currently indexing. - if (auto *ElPTy = dyn_cast(ElTy)) - ElTy = ElPTy->getPointerElementType(); - else - ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II); - } - // And create a GEP to extract those indices. - V = IRB.CreateGEP(ArgIndex.first, V, Ops, V->getName() + ".idx"); - Ops.clear(); + Value *V = *AI; + const auto &ArgParts = ArgsToPromote.find(&*I)->second; + for (const auto &Pair : ArgParts) { + LoadInst *LI = IRB.CreateAlignedLoad( + Pair.second.Ty, + createByteGEP(IRB, DL, V, Pair.second.Ty, Pair.first), + Pair.second.Alignment, V->getName() + ".val"); + if (Pair.second.MustExecLoad) { + // TODO: Transfer other metadata like !nonnull here. + // TODO: Blindly transferring AA metadata in this way is incorrect. + // E.g. TBAA may require offset adjustment. + LI->setAAMetadata(Pair.second.MustExecLoad->getAAMetadata()); } - // Since we're replacing a load make sure we take the alignment - // of the previous load. - LoadInst *newLoad = - IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val"); - newLoad->setAlignment(OrigLoad->getAlign()); - // Transfer the AA info too. - newLoad->setAAMetadata(OrigLoad->getAAMetadata()); - - Args.push_back(newLoad); + Args.push_back(LI); ArgAttrVec.push_back(AttributeSet()); } } @@ -416,57 +378,46 @@ if (Arg.use_empty()) continue; + SmallDenseMap OffsetToArg; + for (const auto &Pair : ArgsToPromote.find(&Arg)->second) { + Argument &NewArg = *I2++; + NewArg.setName(Arg.getName() + "." + Twine(Pair.first) + ".val"); + OffsetToArg.insert({Pair.first, &NewArg}); + } + // Otherwise, if we promoted this argument, then all users are load - // instructions (or GEPs with only load users), and all loads should be - // using the new argument that we added. - ScalarizeTable &ArgIndices = ScalarizedElements[&Arg]; - - while (!Arg.use_empty()) { - if (LoadInst *LI = dyn_cast(Arg.user_back())) { - assert(ArgIndices.begin()->second.empty() && - "Load element should sort to front!"); - I2->setName(Arg.getName() + ".val"); - LI->replaceAllUsesWith(&*I2); - LI->eraseFromParent(); - LLVM_DEBUG(dbgs() << "*** Promoted load of argument '" << Arg.getName() - << "' in function '" << F->getName() << "'\n"); - } else { - GetElementPtrInst *GEP = cast(Arg.user_back()); - assert(!GEP->use_empty() && - "GEPs without uses should be cleaned up already"); - IndicesVector Operands; - Operands.reserve(GEP->getNumIndices()); - for (const Use &Idx : GEP->indices()) - Operands.push_back(cast(Idx)->getSExtValue()); - - // GEPs with a single 0 index can be merged with direct loads - if (Operands.size() == 1 && Operands.front() == 0) - Operands.clear(); - - Function::arg_iterator TheArg = I2; - for (ScalarizeTable::iterator It = ArgIndices.begin(); - It->second != Operands; ++It, ++TheArg) { - assert(It != ArgIndices.end() && "GEP not handled??"); - } + // instructions (with possible casts and GEPs in between). 
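// Aside: at each call site, every promoted part of the pointer argument V
// becomes an explicit address computation plus load in the caller. A
// simplified restatement of the loop above, using the same names as the
// patch and omitting the AA-metadata transfer:
for (const auto &Pair : ArgParts) {
  Value *Addr = createByteGEP(IRB, DL, V, Pair.second.Ty, Pair.first);
  LoadInst *LI = IRB.CreateAlignedLoad(Pair.second.Ty, Addr,
                                       Pair.second.Alignment,
                                       V->getName() + ".val");
  Args.push_back(LI);           // Pass the loaded value instead of the pointer.
  ArgAttrVec.push_back(AttributeSet());
}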
+ + SmallVector Worklist; + SmallVector DeadInsts; + append_range(Worklist, Arg.users()); + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + if (isa(V) || isa(V)) { + DeadInsts.push_back(cast(V)); + append_range(Worklist, V->users()); + continue; + } - TheArg->setName(formatv("{0}.{1:$[.]}.val", Arg.getName(), - make_range(Operands.begin(), Operands.end()))); + if (auto *LI = dyn_cast(V)) { + Value *Ptr = LI->getPointerOperand(); + APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); + Ptr = + Ptr->stripAndAccumulateConstantOffsets(DL, Offset, + /* AllowNonInbounds */ true); + assert(Ptr == &Arg && "Not constant offset from arg?"); + LI->replaceAllUsesWith(OffsetToArg[Offset.getSExtValue()]); + DeadInsts.push_back(LI); + continue; + } - LLVM_DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName() - << "' of function '" << NF->getName() << "'\n"); + llvm_unreachable("Unexpected user"); + } - // All of the uses must be load instructions. Replace them all with - // the argument specified by ArgNo. - while (!GEP->use_empty()) { - LoadInst *L = cast(GEP->user_back()); - L->replaceAllUsesWith(&*TheArg); - L->eraseFromParent(); - } - GEP->eraseFromParent(); - } + for (Instruction *I : DeadInsts) { + I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->eraseFromParent(); } - // Increment I2 past all of the arguments added for this promoted pointer. - std::advance(I2, ArgIndices.size()); } return NF; @@ -474,100 +425,39 @@ /// Return true if we can prove that all callees pass in a valid pointer for the /// specified function argument. -static bool allCallersPassValidPointerForArgument(Argument *Arg, Type *Ty) { - Function *Callee = Arg->getParent(); - const DataLayout &DL = Callee->getParent()->getDataLayout(); - - unsigned ArgNo = Arg->getArgNo(); +static bool allCallersPassValidPointerForArgument(Argument *Arg, + Align NeededAlign, + uint64_t NeededDerefBytes, + const DataLayout &DL) { + // Check if the argument itself is marked dereferenceable and aligned. + APInt Bytes(64, NeededDerefBytes); + if (isDereferenceableAndAlignedPointer(Arg, NeededAlign, Bytes, DL)) + return true; // Look at all call sites of the function. At this point we know we only have // direct callees. + Function *Callee = Arg->getParent(); + unsigned ArgNo = Arg->getArgNo(); for (User *U : Callee->users()) { CallBase &CB = cast(*U); - - if (!isDereferenceablePointer(CB.getArgOperand(ArgNo), Ty, DL)) + if (!isDereferenceableAndAlignedPointer(CB.getArgOperand(ArgNo), + NeededAlign, Bytes, DL)) return false; } return true; } -/// Returns true if Prefix is a prefix of longer. That means, Longer has a size -/// that is greater than or equal to the size of prefix, and each of the -/// elements in Prefix is the same as the corresponding elements in Longer. -/// -/// This means it also returns true when Prefix and Longer are equal! -static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) { - if (Prefix.size() > Longer.size()) - return false; - return std::equal(Prefix.begin(), Prefix.end(), Longer.begin()); -} - -/// Checks if Indices, or a prefix of Indices, is in Set. -static bool prefixIn(const IndicesVector &Indices, - std::set &Set) { - std::set::iterator Low; - Low = Set.upper_bound(Indices); - if (Low != Set.begin()) - Low--; - // Low is now the last element smaller than or equal to Indices. This means - // it points to a prefix of Indices (possibly Indices itself), if such - // prefix exists. 
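// Aside: a worked example for the reworked allCallersPassValidPointerForArgument
// above (made-up numbers). If findArgParts later records two conditionally
// executed loads, an i32 at offset 4 with align 4 and an i64 at offset 8 with
// align 8, it asks for
//   NeededDerefBytes = max(4 + 4, 8 + 8) = 16
//   NeededAlign      = max(4, 8)         = 8
// so promotion is only allowed if the argument, or the actual pointer at every
// direct call site, passes a query of the form:
Align NeededAlign(8);
APInt NeededBytes(64, 16);
bool Speculatable =
    isDereferenceableAndAlignedPointer(Arg, NeededAlign, NeededBytes, DL);
// A caller passing e.g. "ptr align 8 dereferenceable(16) %p" satisfies this;
// a plain "ptr %p" does not, and promotion is rejected.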
- // - // This load is safe if any prefix of its operands is safe to load. - return Low != Set.end() && isPrefix(*Low, Indices); -} - -/// Mark the given indices (ToMark) as safe in the given set of indices -/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there -/// is already a prefix of Indices in Safe, Indices are implicitely marked safe -/// already. Furthermore, any indices that Indices is itself a prefix of, are -/// removed from Safe (since they are implicitely safe because of Indices now). -static void markIndicesSafe(const IndicesVector &ToMark, - std::set &Safe) { - std::set::iterator Low; - Low = Safe.upper_bound(ToMark); - // Guard against the case where Safe is empty - if (Low != Safe.begin()) - Low--; - // Low is now the last element smaller than or equal to Indices. This - // means it points to a prefix of Indices (possibly Indices itself), if - // such prefix exists. - if (Low != Safe.end()) { - if (isPrefix(*Low, ToMark)) - // If there is already a prefix of these indices (or exactly these - // indices) marked a safe, don't bother adding these indices - return; - - // Increment Low, so we can use it as a "insert before" hint - ++Low; - } - // Insert - Low = Safe.insert(Low, ToMark); - ++Low; - // If there we're a prefix of longer index list(s), remove those - std::set::iterator End = Safe.end(); - while (Low != End && isPrefix(ToMark, *Low)) { - std::set::iterator Remove = Low; - ++Low; - Safe.erase(Remove); - } -} - -/// isSafeToPromoteArgument - As you might guess from the name of this method, -/// it checks to see if it is both safe and useful to promote the argument. -/// This method limits promotion of aggregates to only promote up to three -/// elements of the aggregate in order to avoid exploding the number of -/// arguments passed in. -static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy, AAResults &AAR, - unsigned MaxElements) { - using GEPIndicesSet = std::set; - +/// Determine that this argument is safe to promote, and find the argument +/// parts it can be promoted into. +static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR, + unsigned MaxElements, bool IsSelfRecursive, + SmallVectorImpl &ArgPartsVec) { // Quick exit for unused arguments if (Arg->use_empty()) return true; - // We can only promote this argument if all of the uses are loads, or are GEP - // instructions (with constant indices) that are subsequently loaded. + // We can only promote this argument if all of the uses are loads at known + // offsets. // // Promoting the argument causes it to be loaded in the caller // unconditionally. This is only safe if we can prove that either the load @@ -578,156 +468,156 @@ // anyway, in the latter case, invalid loads won't happen. This prevents us // from introducing an invalid load that wouldn't have happened in the // original code. - // - // This set will contain all sets of indices that are loaded in the entry - // block, and thus are safe to unconditionally load in the caller. - GEPIndicesSet SafeToUnconditionallyLoad; - - // This set contains all the sets of indices that we are planning to promote. - // This makes it possible to limit the number of arguments added. - GEPIndicesSet ToPromote; - - // If the pointer is always valid, any load with first index 0 is valid. 
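// Aside: "loads at known offsets" above means a load's pointer operand must
// strip back to the argument itself with a compile-time-constant byte offset.
// A minimal sketch of that test (essentially the entry check of the
// HandleLoad helper that follows), assuming the module DataLayout; the helper
// name is hypothetical:
static Optional<int64_t> constantOffsetFromArg(Argument *Arg, LoadInst *LI,
                                               const DataLayout &DL) {
  Value *Ptr = LI->getPointerOperand();
  APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
  Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset,
                                               /* AllowNonInbounds */ true);
  if (Ptr != Arg)
    return None;  // Based on something other than the argument.
  if (Offset.getSignificantBits() >= 64)
    return None;  // Offset too large to track as an int64_t.
  return Offset.getSExtValue();
}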
- - if (ByValTy) - SafeToUnconditionallyLoad.insert(IndicesVector(1, 0)); - - // Whenever a new underlying type for the operand is found, make sure it's - // consistent with the GEPs and loads we've already seen and, if necessary, - // use it to see if all incoming pointers are valid (which implies the 0-index - // is safe). - Type *BaseTy = ByValTy; - auto UpdateBaseTy = [&](Type *NewBaseTy) { - if (BaseTy) - return BaseTy == NewBaseTy; - - BaseTy = NewBaseTy; - if (allCallersPassValidPointerForArgument(Arg, BaseTy)) { - assert(SafeToUnconditionallyLoad.empty()); - SafeToUnconditionallyLoad.insert(IndicesVector(1, 0)); + + SmallDenseMap ArgParts; + Align NeededAlign(1); + uint64_t NeededDerefBytes = 0; + + // Returns None if this load is not based on the argument. Return true if + // we can promote the load, false otherwise. + auto HandleLoad = [&](LoadInst *LI, + bool GuaranteedToExecute) -> Optional { + Value *Ptr = LI->getPointerOperand(); + APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); + Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset, + /* AllowNonInbounds */ true); + if (Ptr != Arg) + return None; + + if (Offset.getSignificantBits() >= 64) + return false; + + Type *Ty = LI->getType(); + TypeSize Size = DL.getTypeStoreSize(Ty); + // Don't try to promote scalable types. + if (Size.isScalable()) + return false; + + // If this is a self-recursive function and one of the types is a pointer, + // then promoting it might lead to recursive promotion. + if (IsSelfRecursive && Ty->isPointerTy()) + return false; + + int64_t Off = Offset.getSExtValue(); + auto Pair = ArgParts.try_emplace( + Off, ArgPart{Ty, LI->getAlign(), GuaranteedToExecute ? LI : nullptr}); + ArgPart &Part = Pair.first->second; + + // We limit promotion to only promoting up to a fixed number of elements of + // the aggregate. + if (MaxElements > 0 && ArgParts.size() >= MaxElements) { + LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: " + << "more than " << MaxElements << " parts\n"); + return false; } - return true; - }; + // For now, we only support loading one specific type at a given offset. + if (Part.Ty != Ty) { + LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: " + << "loaded via both " << *Part.Ty << " and " << *Ty + << " at offset " << Off << "\n"); + return false; + } - // First, iterate functions that are guaranteed to execution on function - // entry and mark loads of (geps of) arguments as safe. - BasicBlock &EntryBlock = Arg->getParent()->front(); - // Declare this here so we can reuse it - IndicesVector Indices; - for (Instruction &I : EntryBlock) { - if (LoadInst *LI = dyn_cast(&I)) { - Value *V = LI->getPointerOperand(); - if (GetElementPtrInst *GEP = dyn_cast(V)) { - V = GEP->getPointerOperand(); - if (V == Arg) { - // This load actually loads (part of) Arg? Check the indices then. - Indices.reserve(GEP->getNumIndices()); - for (Use &Idx : GEP->indices()) - if (ConstantInt *CI = dyn_cast(Idx)) - Indices.push_back(CI->getSExtValue()); - else - // We found a non-constant GEP index for this argument? Bail out - // right away, can't promote this argument at all. - return false; - - if (!UpdateBaseTy(GEP->getSourceElementType())) - return false; - - // Indices checked out, mark them as safe - markIndicesSafe(Indices, SafeToUnconditionallyLoad); - Indices.clear(); - } - } else if (V == Arg) { - // Direct loads are equivalent to a GEP with a single 0 index. 
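// Aside: a toy model of the per-offset ArgParts bookkeeping above, with LLVM
// types replaced by plain strings (assumes <map> and <string>). Loads of i32
// at offsets 0 and 4 create two parts; a later load of a different type at
// offset 0 fails the "one type per offset" rule, which is exactly the
// situation exercised by callee_different_types in the new opaque-ptr.ll test:
static bool tryAddPart(std::map<int64_t, std::string> &Parts, int64_t Off,
                       const std::string &Ty) {
  // Record the first type seen at this offset; reject a different one later.
  auto It = Parts.try_emplace(Off, Ty).first;
  return It->second == Ty;
}
// tryAddPart(P, 0, "i32"); tryAddPart(P, 4, "i32");   // both succeed
// tryAddPart(P, 0, "float");                          // fails, not promoted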
- markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad); + // If this load is not guaranteed to execute and we haven't seen a load at + // this offset before (or it had lower alignment), then we need to remember + // that requirement. + if (!GuaranteedToExecute && + (Pair.second || Part.Alignment < LI->getAlign())) { + // We won't be able to prove dereferenceability for negative offsets. + if (Off < 0) + return false; - if (BaseTy && LI->getType() != BaseTy) - return false; + // If the offset is not aligned, an aligned base pointer won't help. + if (!isAligned(LI->getAlign(), Off)) + return false; - BaseTy = LI->getType(); - } + NeededDerefBytes = std::max(NeededDerefBytes, Off + Size.getFixedValue()); + NeededAlign = std::max(NeededAlign, LI->getAlign()); } + Part.Alignment = std::max(Part.Alignment, LI->getAlign()); + return true; + }; + + // Look for loads that are guaranteed to execute on entry. + for (Instruction &I : Arg->getParent()->getEntryBlock()) { + if (LoadInst *LI = dyn_cast(&I)) + if (Optional Res = HandleLoad(LI, /* GuaranteedToExecute */ true)) + if (!Res) + return false; + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) break; } - // Now, iterate all uses of the argument to see if there are any uses that are - // not (GEP+)loads, or any (GEP+)loads that are not safe to promote. + // Now look at all loads of the argument. Remember the load instructions + // for the aliasing check below. + SmallVector Worklist; + SmallPtrSet Visited; SmallVector Loads; - IndicesVector Operands; - for (Use &U : Arg->uses()) { - User *UR = U.getUser(); - Operands.clear(); - if (LoadInst *LI = dyn_cast(UR)) { - // Don't hack volatile/atomic loads - if (!LI->isSimple()) - return false; - Loads.push_back(LI); - // Direct loads are equivalent to a GEP with a zero index and then a load. - Operands.push_back(0); + auto AppendUsers = [&](Value *V) { + for (User *U : V->users()) + if (Visited.insert(U).second) + Worklist.push_back(U); + }; + AppendUsers(Arg); + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + if (isa(V)) { + AppendUsers(V); + continue; + } - if (!UpdateBaseTy(LI->getType())) + if (auto *GEP = dyn_cast(V)) { + if (!GEP->hasAllConstantIndices()) return false; - } else if (GetElementPtrInst *GEP = dyn_cast(UR)) { - if (GEP->use_empty()) { - // Dead GEP's cause trouble later. Just remove them if we run into - // them. - continue; - } + AppendUsers(V); + continue; + } - if (!UpdateBaseTy(GEP->getSourceElementType())) + if (auto *LI = dyn_cast(V)) { + if (!*HandleLoad(LI, /* GuaranteedToExecute */ false)) return false; - - // Ensure that all of the indices are constants. - for (Use &Idx : GEP->indices()) - if (ConstantInt *C = dyn_cast(Idx)) - Operands.push_back(C->getSExtValue()); - else - return false; // Not a constant operand GEP! - - // Ensure that the only users of the GEP are load instructions. - for (User *GEPU : GEP->users()) - if (LoadInst *LI = dyn_cast(GEPU)) { - // Don't hack volatile/atomic loads - if (!LI->isSimple()) - return false; - Loads.push_back(LI); - } else { - // Other uses than load? - return false; - } - } else { - return false; // Not a load or a GEP. + Loads.push_back(LI); + continue; } - // Now, see if it is safe to promote this load / loads of this GEP. Loading - // is safe if Operands, or a prefix of Operands, is marked as safe. - if (!prefixIn(Operands, SafeToUnconditionallyLoad)) - return false; + // Unknown user. 
+ LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: " + << "unknown user " << *V << "\n"); + return false; + } - // See if we are already promoting a load with these indices. If not, check - // to make sure that we aren't promoting too many elements. If so, nothing - // to do. - if (ToPromote.find(Operands) == ToPromote.end()) { - if (MaxElements > 0 && ToPromote.size() == MaxElements) { - LLVM_DEBUG(dbgs() << "argpromotion not promoting argument '" - << Arg->getName() - << "' because it would require adding more " - << "than " << MaxElements - << " arguments to the function.\n"); - // We limit aggregate promotion to only promoting up to a fixed number - // of elements of the aggregate. - return false; - } - ToPromote.insert(std::move(Operands)); + if (NeededDerefBytes || NeededAlign > 1) { + // Try to prove a required deref / aligned requirement. + if (!allCallersPassValidPointerForArgument(Arg, NeededAlign, + NeededDerefBytes, DL)) { + LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: " + << "not dereferenceable or aligned\n"); + return false; } } - if (Loads.empty()) + if (ArgParts.empty()) return true; // No users, this is a dead argument. + // Sort parts by offset. + append_range(ArgPartsVec, ArgParts); + sort(ArgPartsVec, + [](const auto &A, const auto &B) { return A.first < B.first; }); + + // Make sure the parts are non-overlapping. + // TODO: As we're doing pure load promotion here, overlap should be fine from + // a correctness perspective. Profitability is less obvious though. + int64_t Offset = ArgPartsVec[0].first; + for (const auto &Pair : ArgPartsVec) { + if (Pair.first < Offset) + return false; // Overlap with previous part. + + Offset = Pair.first + DL.getTypeStoreSize(Pair.second.Ty); + } + // Okay, now we know that the argument is only used by load instructions and // it is safe to unconditionally perform all of them. Use alias analysis to // check to see if the pointer is guaranteed to not be modified from entry of @@ -834,30 +724,19 @@ return false; } -/// Check if callers and the callee \p F agree how promoted arguments would be -/// passed. The ones that they do not agree on are eliminated from the sets but -/// the return value has to be observed as well. -static bool areFunctionArgsABICompatible( - const Function &F, const TargetTransformInfo &TTI, - SmallPtrSetImpl &ArgsToPromote, - SmallPtrSetImpl &ByValArgsToTransform) { - // TODO: Check individual arguments so we can promote a subset? - SmallVector Types; - for (Argument *Arg : ArgsToPromote) - Types.push_back(Arg->getType()->getPointerElementType()); - for (Argument *Arg : ByValArgsToTransform) - Types.push_back(Arg->getParamByValType()); - - for (const Use &U : F.uses()) { +/// Check if callers and callee agree on how promoted arguments would be +/// passed. +static bool areTypesABICompatible(ArrayRef Types, const Function &F, + const TargetTransformInfo &TTI) { + return all_of(F.uses(), [&](const Use &U) { CallBase *CB = dyn_cast(U.getUser()); if (!CB) return false; + const Function *Caller = CB->getCaller(); const Function *Callee = CB->getCalledFunction(); - if (!TTI.areTypesABICompatible(Caller, Callee, Types)) - return false; - } - return true; + return TTI.areTypesABICompatible(Caller, Callee, Types); + }); } /// PromoteArguments - This method checks the specified function to see if there @@ -931,11 +810,9 @@ // Check to see which arguments are promotable. If an argument is promotable, // add it to ArgsToPromote. 
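// Aside: the non-overlap check above, restated over the (offset, part) list
// that findArgParts has already sorted by offset; each part must start at or
// after the end of the previous one. The function name is hypothetical:
static bool partsAreDisjoint(ArrayRef<OffsetAndArgPart> Parts,
                             const DataLayout &DL) {
  int64_t End = Parts.empty() ? 0 : Parts.front().first;
  for (const auto &Pair : Parts) {
    if (Pair.first < End)
      return false; // This part overlaps the previous one.
    End = Pair.first + DL.getTypeStoreSize(Pair.second.Ty).getFixedValue();
  }
  return true;
}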
- SmallPtrSet ArgsToPromote; + DenseMap> ArgsToPromote; SmallPtrSet ByValArgsToTransform; for (Argument *PtrArg : PointerArgs) { - Type *AgTy = PtrArg->getType()->getPointerElementType(); - // Replace sret attribute with noalias. This reduces register pressure by // avoiding a register copy. if (PtrArg->hasStructRetAttr()) { @@ -955,11 +832,13 @@ // // Only handle arguments with specified alignment; if it's unspecified, the // actual alignment of the argument is target-specific. - bool isSafeToPromote = PtrArg->hasByValAttr() && PtrArg->getParamAlign() && - (ArgumentPromotionPass::isDenselyPacked(AgTy, DL) || - !canPaddingBeAccessed(PtrArg)); + Type *ByValTy = PtrArg->getParamByValType(); + bool isSafeToPromote = + ByValTy && PtrArg->getParamAlign() && + (ArgumentPromotionPass::isDenselyPacked(ByValTy, DL) || + !canPaddingBeAccessed(PtrArg)); if (isSafeToPromote) { - if (StructType *STy = dyn_cast(AgTy)) { + if (StructType *STy = dyn_cast(ByValTy)) { if (MaxElements > 0 && STy->getNumElements() > MaxElements) { LLVM_DEBUG(dbgs() << "argpromotion disable promoting argument '" << PtrArg->getName() @@ -969,51 +848,39 @@ continue; } + SmallVector Types; + append_range(Types, STy->elements()); + // If all the elements are single-value types, we can promote it. - bool AllSimple = true; - for (const auto *EltTy : STy->elements()) { - if (!EltTy->isSingleValueType()) { - AllSimple = false; - break; - } - } + bool AllSimple = + all_of(Types, [](Type *Ty) { return Ty->isSingleValueType(); }); // Safe to transform, don't even bother trying to "promote" it. // Passing the elements as a scalar will allow sroa to hack on // the new alloca we introduce. - if (AllSimple) { + if (AllSimple && areTypesABICompatible(Types, *F, TTI)) { ByValArgsToTransform.insert(PtrArg); continue; } } } - // If the argument is a recursive type and we're in a recursive - // function, we could end up infinitely peeling the function argument. - if (isSelfRecursive) { - if (StructType *STy = dyn_cast(AgTy)) { - bool RecursiveType = - llvm::is_contained(STy->elements(), PtrArg->getType()); - if (RecursiveType) - continue; - } - } - // Otherwise, see if we can promote the pointer to its value. - Type *ByValTy = - PtrArg->hasByValAttr() ? PtrArg->getParamByValType() : nullptr; - if (isSafeToPromoteArgument(PtrArg, ByValTy, AAR, MaxElements)) - ArgsToPromote.insert(PtrArg); + SmallVector ArgParts; + if (findArgParts(PtrArg, DL, AAR, MaxElements, isSelfRecursive, ArgParts)) { + SmallVector Types; + for (const auto &Pair : ArgParts) + Types.push_back(Pair.second.Ty); + + if (areTypesABICompatible(Types, *F, TTI)) + ArgsToPromote.insert({PtrArg, std::move(ArgParts)}); + } } // No promotable pointer arguments. 
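// Aside: the byval pre-check used below, restated as a standalone predicate:
// a byval struct is only expanded when it has few enough elements and every
// element is a single-value type (integer, FP, pointer or vector), so SROA
// can clean up the replacement alloca. The function name is hypothetical:
static bool isByValExpandable(StructType *STy, unsigned MaxElements) {
  if (MaxElements > 0 && STy->getNumElements() > MaxElements)
    return false;
  return all_of(STy->elements(),
                [](Type *EltTy) { return EltTy->isSingleValueType(); });
}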
if (ArgsToPromote.empty() && ByValArgsToTransform.empty()) return nullptr; - if (!areFunctionArgsABICompatible( - *F, TTI, ArgsToPromote, ByValArgsToTransform)) - return nullptr; - return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite); } Index: llvm/test/Transforms/ArgumentPromotion/align.ll =================================================================== --- llvm/test/Transforms/ArgumentPromotion/align.ll +++ llvm/test/Transforms/ArgumentPromotion/align.ll @@ -3,8 +3,8 @@ define internal i32 @callee_must_exec(i32* %p) { ; CHECK-LABEL: define {{[^@]+}}@callee_must_exec -; CHECK-SAME: (i32 [[P_VAL:%.*]]) { -; CHECK-NEXT: ret i32 [[P_VAL]] +; CHECK-SAME: (i32 [[P_0_VAL:%.*]]) { +; CHECK-NEXT: ret i32 [[P_0_VAL]] ; %x = load i32, i32* %p, align 16 ret i32 %x @@ -23,10 +23,10 @@ define internal i32 @callee_guaranteed_aligned(i1 %c, i32* %p) { ; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned -; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_VAL:%.*]]) { +; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_0_VAL:%.*]]) { ; CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: ret i32 [[P_VAL]] +; CHECK-NEXT: ret i32 [[P_0_VAL]] ; CHECK: else: ; CHECK-NEXT: ret i32 -1 ; @@ -51,14 +51,15 @@ ret void } -; TODO: This should not be promoted, as the caller only guarantees that the +; This should not be promoted, as the caller only guarantees that the ; pointer is dereferenceable, not that it is aligned. define internal i32 @callee_not_guaranteed_aligned(i1 %c, i32* %p) { ; CHECK-LABEL: define {{[^@]+}}@callee_not_guaranteed_aligned -; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_VAL:%.*]]) { +; CHECK-SAME: (i1 [[C:%.*]], i32* [[P:%.*]]) { ; CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: ret i32 [[P_VAL]] +; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P]], align 16 +; CHECK-NEXT: ret i32 [[X]] ; CHECK: else: ; CHECK-NEXT: ret i32 -1 ; @@ -75,8 +76,7 @@ define void @caller_not_guaranteed_aligned(i1 %c, i32* dereferenceable(4) %p) { ; CHECK-LABEL: define {{[^@]+}}@caller_not_guaranteed_aligned ; CHECK-SAME: (i1 [[C:%.*]], i32* dereferenceable(4) [[P:%.*]]) { -; CHECK-NEXT: [[P_VAL:%.*]] = load i32, i32* [[P]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_not_guaranteed_aligned(i1 [[C]], i32 [[P_VAL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_not_guaranteed_aligned(i1 [[C]], i32* [[P]]) ; CHECK-NEXT: ret void ; call i32 @callee_not_guaranteed_aligned(i1 %c, i32* %p) Index: llvm/test/Transforms/ArgumentPromotion/fp80.ll =================================================================== --- llvm/test/Transforms/ArgumentPromotion/fp80.ll +++ llvm/test/Transforms/ArgumentPromotion/fp80.ll @@ -15,12 +15,18 @@ define void @run() { ; CHECK-LABEL: define {{[^@]+}}@run() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @UseLongDoubleUnsafely(%union.u* byval([[UNION_U:%.*]]) align 16 bitcast (%struct.s* @b to %union.u*)) -; CHECK-NEXT: [[DOT0:%.*]] = getelementptr [[UNION_U]], %union.u* bitcast (%struct.s* @b to %union.u*), i32 0, i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %union.u* bitcast (%struct.s* @b to %union.u*) to i8* +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 10 +; CHECK-NEXT: [[DOTVAL:%.*]] = load i8, i8* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @UseLongDoubleUnsafely(i8 [[DOTVAL]]) +; CHECK-NEXT: [[DOT0:%.*]] = getelementptr [[UNION_U:%.*]], %union.u* bitcast (%struct.s* @b to %union.u*), i32 0, i32 0 ; CHECK-NEXT: [[DOT0_VAL:%.*]] = load x86_fp80, 
x86_fp80* [[DOT0]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = tail call x86_fp80 @UseLongDoubleSafely(x86_fp80 [[DOT0_VAL]]) -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @AccessPaddingOfStruct(%struct.Foo* byval([[STRUCT_FOO:%.*]]) @a) -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @CaptureAStruct(%struct.Foo* byval([[STRUCT_FOO]]) @a) +; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_fp80 @UseLongDoubleSafely(x86_fp80 [[DOT0_VAL]]) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_FOO:%.*]], %struct.Foo* @a, i64 0, i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to i64* +; CHECK-NEXT: [[A_VAL:%.*]] = load i64, i64* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @AccessPaddingOfStruct(i64 [[A_VAL]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @CaptureAStruct(%struct.Foo* byval([[STRUCT_FOO]]) @a) ; CHECK-NEXT: ret void ; entry: @@ -33,12 +39,9 @@ define internal i8 @UseLongDoubleUnsafely(%union.u* byval(%union.u) align 16 %arg) { ; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleUnsafely -; CHECK-SAME: (%union.u* byval([[UNION_U:%.*]]) align 16 [[ARG:%.*]]) { +; CHECK-SAME: (i8 [[ARG_10_VAL:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[BITCAST:%.*]] = bitcast %union.u* [[ARG]] to %struct.s* -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.s* [[BITCAST]], i64 0, i32 2 -; CHECK-NEXT: [[RESULT:%.*]] = load i8, i8* [[GEP]], align 1 -; CHECK-NEXT: ret i8 [[RESULT]] +; CHECK-NEXT: ret i8 [[ARG_10_VAL]] ; entry: %bitcast = bitcast %union.u* %arg to %struct.s* @@ -64,10 +67,8 @@ define internal i64 @AccessPaddingOfStruct(%struct.Foo* byval(%struct.Foo) %a) { ; CHECK-LABEL: define {{[^@]+}}@AccessPaddingOfStruct -; CHECK-SAME: (%struct.Foo* byval([[STRUCT_FOO:%.*]]) [[A:%.*]]) { -; CHECK-NEXT: [[P:%.*]] = bitcast %struct.Foo* [[A]] to i64* -; CHECK-NEXT: [[V:%.*]] = load i64, i64* [[P]], align 8 -; CHECK-NEXT: ret i64 [[V]] +; CHECK-SAME: (i64 [[A_0_VAL:%.*]]) { +; CHECK-NEXT: ret i64 [[A_0_VAL]] ; %p = bitcast %struct.Foo* %a to i64* %v = load i64, i64* %p Index: llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes +; RUN: opt -S -argpromotion -opaque-pointers < %s | FileCheck %s + +define internal i32 @callee_basic(ptr %p) { +; CHECK-LABEL: define {{[^@]+}}@callee_basic +; CHECK-SAME: (i32 [[P_0_VAL:%.*]], i32 [[P_4_VAL:%.*]]) { +; CHECK-NEXT: [[Z:%.*]] = add i32 [[P_0_VAL]], [[P_4_VAL]] +; CHECK-NEXT: ret i32 [[Z]] +; + %x = load i32, ptr %p + %p1 = getelementptr i8, ptr %p, i64 4 + %y = load i32, ptr %p1 + %z = add i32 %x, %y + ret i32 %z +} + +define void @caller_basic(ptr %p) { +; CHECK-LABEL: define {{[^@]+}}@caller_basic +; CHECK-SAME: (ptr [[P:%.*]]) { +; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[P_VAL1:%.*]] = load i32, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @callee_basic(i32 [[P_VAL]], i32 [[P_VAL1]]) +; CHECK-NEXT: ret void +; + call i32 @callee_basic(ptr %p) + ret void +} + +; Same offset is loaded with two differen types: Don't promote. 
+define internal i32 @callee_different_types(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@callee_different_types
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[Y_F:%.*]] = load float, ptr [[P]], align 4
+; CHECK-NEXT:    [[Y:%.*]] = fptoui float [[Y_F]] to i32
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[X]], [[Y]]
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %x = load i32, ptr %p
+  %y.f = load float, ptr %p
+  %y = fptoui float %y.f to i32
+  %z = add i32 %x, %y
+  ret i32 %z
+}
+
+define void @caller_different_types(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@caller_different_types
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @callee_different_types(ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call i32 @callee_different_types(ptr %p)
+  ret void
+}
+
+; The two loads overlap: Don't promote.
+define internal i32 @callee_overlap(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@callee_overlap
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT:    [[Y:%.*]] = load i32, ptr [[P1]], align 4
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[X]], [[Y]]
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %x = load i32, ptr %p
+  %p1 = getelementptr i8, ptr %p, i64 2
+  %y = load i32, ptr %p1
+  %z = add i32 %x, %y
+  ret i32 %z
+}
+
+define void @caller_overlap(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@caller_overlap
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @callee_overlap(ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call i32 @callee_overlap(ptr %p)
+  ret void
+}
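For intuition, the callee_basic/caller_basic pair above corresponds roughly to the following source-level rewrite. This is only a loose C++ analogy of what the IR transformation does, not code produced by the pass, and all names are made up:

// Before promotion: the callee performs the two loads itself.
static int calleeBasic(const int *P) { return P[0] + P[1]; }
int callerBefore(const int *P) { return calleeBasic(P); }

// After promotion: the caller loads the two i32 parts (byte offsets 0 and 4)
// and passes the values, so the callee no longer touches memory.
static int calleeBasicPromoted(int P0Val, int P4Val) { return P0Val + P4Val; }
int callerAfter(const int *P) { return calleeBasicPromoted(P[0], P[1]); }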