Index: llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
===================================================================
--- llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -44,9 +44,10 @@
 /// combine them.
 class LLVM_LIBRARY_VISIBILITY InstCombiner {
   /// Only used to call target specific inst combining.
-  TargetTransformInfo &TTI;
 
 public:
+  TargetTransformInfo &TTI;
+
   /// Maximum size of array considered when transforming.
   uint64_t MaxArraySizeForCombine = 0;
 
Index: llvm/lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -188,6 +188,8 @@
   LoadInst *combineLoadToNewType(LoadInst &LI, Type *NewTy,
                                  const Twine &Suffix = "");
 
+  bool isFreeExt(Instruction *ExtI);
+  bool canFoldLoadIntoPHI(PHINode &PN, ArrayRef<Value *> IncLIs, Align &LoadAlignment);
 private:
   void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI);
 
Index: llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -20,6 +20,8 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/InstructionCost.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -655,13 +657,16 @@
   return true;
 }
 
-Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
-  LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0));
-
+// Returns true if foldPHIArgLoadIntoPHI can fold the IncLIs loads as arguments
+// into the PHI node PN. On success it also writes the alignment required for
+// the new load into LoadAlignment.
+bool InstCombinerImpl::canFoldLoadIntoPHI(PHINode &PN, ArrayRef<Value *> IncLIs,
+                                          Align &LoadAlignment) {
+  auto FirstLI = dyn_cast<LoadInst>(IncLIs[0]);
   // FIXME: This is overconservative; this transform is allowed in some cases
   // for atomic operations.
   if (FirstLI->isAtomic())
-    return nullptr;
+    return false;
 
   // When processing loads, we need to propagate two bits of information to the
   // sunk load: whether it is volatile, and what its alignment is.  We currently
@@ -669,27 +674,27 @@
   // visitLoadInst will propagate an alignment onto the load when TD is around,
   // and if TD isn't around, we can't handle the mixed case.
   bool isVolatile = FirstLI->isVolatile();
-  Align LoadAlignment = FirstLI->getAlign();
+  LoadAlignment = FirstLI->getAlign();
   unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
 
   // We can't sink the load if the loaded value could be modified between the
   // load and the PHI.
   if (FirstLI->getParent() != PN.getIncomingBlock(0) ||
       !isSafeAndProfitableToSinkLoad(FirstLI))
-    return nullptr;
+    return false;
 
   // If the PHI is of volatile loads and the load block has multiple
   // successors, sinking it would remove a load of the volatile value from
   // the path through the other successor.
   if (isVolatile &&
       FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1)
-    return nullptr;
+    return false;
 
   // Check to see if all arguments are the same operation.
   for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
-    LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i));
+    auto LI = dyn_cast<LoadInst>(IncLIs[i]);
     if (!LI || !LI->hasOneUser())
-      return nullptr;
+      return false;
 
     // We can't sink the load if the loaded value could be modified between
     // the load and the PHI.
@@ -697,24 +702,36 @@
         LI->getParent() != PN.getIncomingBlock(i) ||
         LI->getPointerAddressSpace() != LoadAddrSpace ||
         !isSafeAndProfitableToSinkLoad(LI))
-      return nullptr;
+      return false;
 
     LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign()));
 
     // If the PHI is of volatile loads and the load block has multiple
    // successors, sinking it would remove a load of the volatile value from
     // the path through the other successor.
-    if (isVolatile &&
-        LI->getParent()->getTerminator()->getNumSuccessors() != 1)
-      return nullptr;
+    if (isVolatile && LI->getParent()->getTerminator()->getNumSuccessors() != 1)
+      return false;
   }
+  return true;
+}
+
+Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
+  Align LoadAlignment;
+  std::vector<Value *> IncLIs;
+  for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+    IncLIs.push_back(PN.getIncomingValue(i));
+  if (!canFoldLoadIntoPHI(PN, ArrayRef<Value *>(IncLIs), LoadAlignment))
+    return nullptr;
+
+  auto FirstLI = cast<LoadInst>(PN.getIncomingValue(0));
 
   // Okay, they are all the same operation.  Create a new PHI node of the
   // correct type, and PHI together all of the LHS's of the instructions.
   PHINode *NewPN =
     PHINode::Create(FirstLI->getOperand(0)->getType(), PN.getNumIncomingValues(),
                     PN.getName()+".in");
 
+  bool isVolatile = FirstLI->isVolatile();
   Value *InVal = FirstLI->getOperand(0);
   NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
   LoadInst *NewLI =
@@ -840,6 +857,39 @@
   return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType());
 }
 
+// Return true if ExtI is a free extension, i.e. check whether the target can
+// fold the SExt or ZExt into the operation that produces its operand;
+// currently this only checks for loads. Returns false otherwise.
+bool InstCombinerImpl::isFreeExt(Instruction *ExtI) {
+  // Check whether ExtI is a sign- or zero-extension.
+  if (ExtI->getOpcode() != Instruction::SExt &&
+      ExtI->getOpcode() != Instruction::ZExt)
+    return false;
+
+  // If the operand isn't an Instruction it can't be a load.
+  auto *LoadI = dyn_cast<Instruction>(ExtI->getOperand(0));
+  if (!LoadI)
+    return false;
+
+  // Check whether we are dealing with a normal or a masked load.
+  bool masked = false;
+  if (isa<IntrinsicInst>(LoadI)) {
+    if (cast<IntrinsicInst>(LoadI)->getIntrinsicID() != Intrinsic::masked_load)
+      return false;
+    masked = true;
+  } else if (LoadI->getOpcode() != Instruction::Load)
+    return false;
+
+  // Check whether the target can merge the load and extend instructions.
+  InstructionCost Cost = TTI.getCastInstrCost(
+      ExtI->getOpcode(), ExtI->getType(), LoadI->getType(),
+      masked ? TTI::CastContextHint::Masked : TTI::CastContextHint::Normal,
+      TTI::TCK_SizeAndLatency, ExtI);
+
+  // Check whether the cost of the Ext is valid and indeed 0, i.e. free.
+  return Cost.isValid() && Cost == 0;
+}
+
 /// If all operands to a PHI node are the same "unary" operator and they all are
 /// only used by the PHI, PHI together their inputs, and do the operation once,
 /// to the result of the PHI.
@@ -887,19 +937,47 @@
     return nullptr; // Cannot fold this operation.
   }
 
+  // We want to keep (S|Z)Exts and loads together when the Ext is free.
+  // OneIsFree records whether any of the Exts feeding into the PHI node
+  // are free; AllAreFree records whether all of them are free.
+  bool OneIsFree = isFreeExt(FirstInst);
+  bool AllAreFree = OneIsFree;
+  std::vector<Value *> IncLIs;
+
+  if (OneIsFree)
+    IncLIs.push_back(FirstInst->getOperand(0));
+
   // Check to see if all arguments are the same operation.
   for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
     Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
     if (!I || !I->hasOneUser() || !I->isSameOperationAs(FirstInst))
       return nullptr;
+
+    if (isFreeExt(I)) {
+      OneIsFree = true;
+      IncLIs.push_back(I->getOperand(0));
+    } else
+      AllAreFree = false;
+
     if (CastSrcTy) {
       if (I->getOperand(0)->getType() != CastSrcTy)
-        return nullptr;  // Cast operation must match.
+        return nullptr; // Cast operation must match.
     } else if (I->getOperand(1) != ConstantOp) {
       return nullptr;
     }
   }
 
+  // If all the Exts are free but the loads cannot be sunk into the PHI, do not
+  // sink the Exts either; if all the Exts are free and the loads can be sunk
+  // as well, we do want to sink both the loads and the Exts.
+  // If only some of the Exts are free and we are not optimizing for size, do
+  // not sink the Exts, since that would penalize the performance of the
+  // path(s) through the free Ext(s).
+  Align dummy;
+  if ((AllAreFree && !canFoldLoadIntoPHI(PN, IncLIs, dummy)) ||
+      (!AllAreFree && OneIsFree && !MinimizeSize))
+    return nullptr;
+
   // Okay, they are all the same operation.  Create a new PHI node of the
   // correct type, and PHI together all of the LHS's of the instructions.
   PHINode *NewPN = PHINode::Create(FirstInst->getOperand(0)->getType(),
Index: llvm/test/Transforms/InstCombine/AArch64/load_extend0.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/AArch64/load_extend0.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local i32 @foo(i8* %ip) local_unnamed_addr {
+; CHECK-LABEL: @foo(
+entry:
+  %0 = load i8, i8* %ip, align 1
+  %conv = zext i8 %0 to i64
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+; CHECK-LABEL: do.body:
+; CHECK-NEXT: %1 = phi i64 [ %conv, %entry ], [ %conv1, %do.body ]
+  %1 = phi i64 [ %conv, %entry ], [ %conv1, %do.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %ip, i64 %1
+  %2 = load i8, i8* %arrayidx, align 1
+  %conv1 = zext i8 %2 to i64
+  %cmp = icmp ult i64 %conv1, 100
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  %conv3 = trunc i64 %conv1 to i32
+  ret i32 %conv3
+}
+
+define dso_local i64 @bar(i8* %ip) local_unnamed_addr {
+; CHECK-LABEL: @bar(
+entry:
+  %0 = load i8, i8* %ip, align 1
+  %conv = zext i8 %0 to i64
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+; CHECK-LABEL: do.body:
+; CHECK-NEXT: %1 = phi i64 [ %conv, %entry ], [ %conv1, %do.body ]
+  %1 = phi i64 [ %conv, %entry ], [ %conv1, %do.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %ip, i64 %1
+  %2 = load i8, i8* %arrayidx, align 1
+  call void @foobar(i8 %2)
+  %conv1 = zext i8 %2 to i64
+  %cmp = icmp ult i64 %conv1, 100
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  ret i64 %conv1
+}
+
+declare dso_local void @foobar(i8) local_unnamed_addr
Index: llvm/test/Transforms/InstCombine/ARM/load_ext_combine.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/ARM/load_ext_combine.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
"e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-none-unknown-eabihf" + +; Function Attrs: nounwind +define dso_local void @foo(i32* %a, i32* %b, i32 %n) local_unnamed_addr { +; CHECK-LABEL: @foo( +entry: + %0 = load i32, i32* %a, align 4 + %conv = zext i32 %0 to i64 + %1 = load i32, i32* %b, align 4 + %conv2 = zext i32 %1 to i64 + %sub = sub nsw i32 %n, 1 + br label %do.body + +do.body: ; preds = %do.body, %entry +; CHECK-LABEL: do.body: +; CHECK-NEXT: %i0.0.in = phi i32 [ %0, %entry ], [ %2, %do.body ] + %i0.0 = phi i64 [ %conv, %entry ], [ %conv3, %do.body ] + %n.addr.0 = phi i32 [ %sub, %entry ], [ %sub7, %do.body ] + %a.addr.0 = phi i32* [ %a, %entry ], [ %add.ptr6, %do.body ] + %mul = mul i64 %i0.0, %conv2 + %add.ptr = getelementptr inbounds i32, i32* %a.addr.0, i32 1 + %2 = load i32, i32* %add.ptr, align 4 + %conv3 = zext i32 %2 to i64 + %conv4 = trunc i64 %mul to i32 + store i32 %conv4, i32* %a.addr.0, align 4 + %add.ptr6 = getelementptr inbounds i32, i32* %a.addr.0, i32 8 + %sub7 = sub nsw i32 %n.addr.0, 8 + %cmp = icmp sgt i32 %sub7, 0 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + ret void +} Index: llvm/test/Transforms/InstCombine/ARM/mve_ext_mask_loads.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/InstCombine/ARM/mve_ext_mask_loads.ll @@ -0,0 +1,63 @@ +; RUN: opt < %s -instcombine -S -mattr=+mve | FileCheck %s + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-none-unknown-eabihf" + +define dso_local void @foo(i8* %a, i8* %b, i32 %n) local_unnamed_addr { +; CHECK-LABEL: @foo( +entry: + %0 = bitcast i8* %a to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %b to <8 x i8>* + %4 = load <8 x i8>, <8 x i8>* %3, align 1 + %5 = zext <8 x i8> %4 to <8 x i16> + %add.ptr = getelementptr inbounds i8, i8* %b, i32 8 + br label %do.body + +do.body: ; preds = %do.body, %entry +; CHECK-LABEL: do.body: +; CHECK-NEXT: %v0.0 = phi <8 x i16> [ %2, %entry ], [ %10, %do.body ] + %v0.0 = phi <8 x i16> [ %2, %entry ], [ %14, %do.body ] + %n.addr.0 = phi i32 [ %n, %entry ], [ %sub, %do.body ] + %b.addr.0 = phi i8* [ %add.ptr, %entry ], [ %add.ptr2, %do.body ] + %a.addr.0 = phi i8* [ %a, %entry ], [ %add.ptr1, %do.body ] + %6 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n.addr.0) + %7 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %6) + %8 = trunc i32 %7 to i16 + %9 = zext i16 %8 to i32 + %10 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %9) + %11 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %v0.0, <8 x i16> %5, <8 x i1> %10, <8 x i16> undef) + %add.ptr1 = getelementptr inbounds i8, i8* %a.addr.0, i32 8 + %12 = bitcast i8* %add.ptr1 to <8 x i8>* + %13 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %12, i32 1, <8 x i1> %10, <8 x i8> zeroinitializer) + %14 = zext <8 x i8> %13 to <8 x i16> + %15 = trunc <8 x i16> %11 to <8 x i8> + %16 = bitcast i8* %a.addr.0 to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %15, <8 x i8>* %16, i32 1, <8 x i1> %10) + %add.ptr2 = getelementptr inbounds i8, i8* %b.addr.0, i32 8 + %sub = sub nsw i32 %n.addr.0, 8 + %cmp = icmp sgt i32 %sub, 0 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + ret void +} + +; Function Attrs: nofree nosync nounwind readnone +declare <8 x i1> @llvm.arm.mve.vctp16(i32) + +; Function Attrs: nofree nosync nounwind readnone 
+declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>)
+
+; Function Attrs: nofree nosync nounwind readnone
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+
+; Function Attrs: nofree nosync nounwind readnone
+declare <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>)
+
+; Function Attrs: argmemonly mustprogress nofree nosync nounwind readonly willreturn
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>)
+
+; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn writeonly
+declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32 immarg, <8 x i1>)