Index: lib/Analysis/ConstantFolding.cpp
===================================================================
--- lib/Analysis/ConstantFolding.cpp
+++ lib/Analysis/ConstantFolding.cpp
@@ -442,8 +442,8 @@
     return nullptr;
 
   GlobalValue *GVal;
-  APInt Offset;
-  if (!IsConstantOffsetFromGlobal(C, GVal, Offset, DL))
+  APInt OffsetAI;
+  if (!IsConstantOffsetFromGlobal(C, GVal, OffsetAI, DL))
     return nullptr;
 
   auto *GV = dyn_cast<GlobalVariable>(GVal);
@@ -451,19 +451,29 @@
       !GV->getInitializer()->getType()->isSized())
     return nullptr;
 
-  // If we're loading off the beginning of the global, some bytes may be valid,
-  // but we don't try to handle this.
-  if (Offset.isNegative())
-    return nullptr;
+  int64_t Offset = OffsetAI.getSExtValue();
+  int64_t InitializerSize = DL.getTypeAllocSize(GV->getInitializer()->getType());
+
+  // If we're not accessing anything in this constant, the result is undefined.
+  if (Offset + BytesLoaded <= 0)
+    return UndefValue::get(IntType);
 
   // If we're not accessing anything in this constant, the result is undefined.
-  if (Offset.getZExtValue() >=
-      DL.getTypeAllocSize(GV->getInitializer()->getType()))
+  if (Offset >= InitializerSize)
     return UndefValue::get(IntType);
 
   unsigned char RawBytes[32] = {0};
-  if (!ReadDataFromGlobal(GV->getInitializer(), Offset.getZExtValue(), RawBytes,
-                          BytesLoaded, DL))
+  unsigned char *CurPtr = RawBytes;
+  unsigned BytesLeft = BytesLoaded;
+
+  // If we're loading off the beginning of the global, some bytes may be valid.
+  if (Offset < 0) {
+    CurPtr += -Offset;
+    BytesLeft += Offset;
+    Offset = 0;
+  }
+
+  if (!ReadDataFromGlobal(GV->getInitializer(), Offset, CurPtr, BytesLeft, DL))
     return nullptr;
 
   APInt ResultVal = APInt(IntType->getBitWidth(), 0);
@@ -1289,6 +1299,7 @@
   case Intrinsic::fmuladd:
   case Intrinsic::copysign:
   case Intrinsic::round:
+  case Intrinsic::masked_load:
   case Intrinsic::sadd_with_overflow:
   case Intrinsic::uadd_with_overflow:
   case Intrinsic::ssub_with_overflow:
@@ -1833,11 +1844,41 @@
 Constant *ConstantFoldVectorCall(StringRef Name, unsigned IntrinsicID,
                                  VectorType *VTy,
                                  ArrayRef<Constant *> Operands,
+                                 const DataLayout &DL,
                                  const TargetLibraryInfo *TLI) {
   SmallVector<Constant *, 4> Result(VTy->getNumElements());
   SmallVector<Constant *, 4> Lane(Operands.size());
   Type *Ty = VTy->getElementType();
 
+  if (IntrinsicID == Intrinsic::masked_load) {
+    auto *SrcPtr = Operands[0];
+    auto *Mask = Operands[2];
+    auto *Passthru = Operands[3];
+    Constant *VecData = ConstantFoldLoadFromConstPtr(SrcPtr, VTy, DL);
+
+    SmallVector<Constant *, 32> NewElements;
+    for (unsigned I = 0, E = VTy->getNumElements(); I != E; ++I) {
+      auto *MaskElt =
+          dyn_cast_or_null<ConstantInt>(Mask->getAggregateElement(I));
+      if (!MaskElt)
+        break;
+      if (MaskElt->isZero()) {
+        auto *PassthruElt = Passthru->getAggregateElement(I);
+        if (!PassthruElt)
+          break;
+        NewElements.push_back(PassthruElt);
+      } else {
+        assert(MaskElt->isOne());
+        auto *VecElt = VecData->getAggregateElement(I);
+        if (!VecElt)
+          break;
+        NewElements.push_back(VecElt);
+      }
+    }
+    if (NewElements.size() == VTy->getNumElements())
+      return ConstantVector::get(NewElements);
+  }
+
   for (unsigned I = 0, E = VTy->getNumElements(); I != E; ++I) {
     // Gather a column of constants.
     for (unsigned J = 0, JE = Operands.size(); J != JE; ++J) {
@@ -1870,7 +1911,8 @@
   Type *Ty = F->getReturnType();
 
   if (auto *VTy = dyn_cast<VectorType>(Ty))
-    return ConstantFoldVectorCall(Name, F->getIntrinsicID(), VTy, Operands, TLI);
+    return ConstantFoldVectorCall(Name, F->getIntrinsicID(), VTy, Operands,
+                                  F->getParent()->getDataLayout(), TLI);
 
   return ConstantFoldScalarCall(Name, F->getIntrinsicID(), Ty, Operands, TLI);
 }
Index: lib/Analysis/InstructionSimplify.cpp
===================================================================
--- lib/Analysis/InstructionSimplify.cpp
+++ lib/Analysis/InstructionSimplify.cpp
@@ -3991,6 +3991,15 @@
                                   Q.DL);
   }
 
+  // Simplify calls to llvm.masked.load.*
+  if (IID == Intrinsic::masked_load) {
+    IterTy MaskArg = ArgBegin + 2;
+    // If the mask is all zeros, the "passthru" argument is the result.
+    if (auto *ConstMask = dyn_cast<Constant>(*MaskArg))
+      if (ConstMask->isNullValue())
+        return ArgBegin[3];
+  }
+
   // Perform idempotent optimizations
   if (!IsIdempotent(IID))
     return nullptr;
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1044,10 +1044,6 @@
   if (!ConstMask)
     return nullptr;
 
-  // If the mask is all zeros, the "passthru" argument is the result.
-  if (ConstMask->isNullValue())
-    return II.getArgOperand(3);
-
   // If the mask is all ones, this is a plain vector load of the 1st argument.
   if (ConstMask->isAllOnesValue()) {
     Value *LoadPtr = II.getArgOperand(0);
Index: test/Transforms/InstSimplify/call.ll
===================================================================
--- test/Transforms/InstSimplify/call.ll
+++ test/Transforms/InstSimplify/call.ll
@@ -204,4 +204,15 @@
 ; CHECK-LABEL: define i32 @call_undef(
 ; CHECK: ret i32 undef
 
+@GV = private constant [8 x i32] [i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49]
+
+define <8 x i32> @partial_masked_load() {
+; CHECK-LABEL: @partial_masked_load(
+; CHECK: ret <8 x i32> <i32 undef, i32 undef, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+  %masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast (i32* getelementptr ([8 x i32], [8 x i32]* @GV, i64 0, i64 -2) to <8 x i32>*), i32 4, <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  ret <8 x i32> %masked.load
+}
+
 declare noalias i8* @malloc(i64)
+
+declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
Index: test/Transforms/InstSimplify/load.ll
===================================================================
--- test/Transforms/InstSimplify/load.ll
+++ test/Transforms/InstSimplify/load.ll
@@ -20,3 +20,11 @@
   ret i32 %load
 }
 
+@GV = private constant [8 x i32] [i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49]
+
+define <8 x i32> @partial_load() {
+; CHECK-LABEL: @partial_load(
+; CHECK: ret <8 x i32> <i32 0, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48>
+  %load = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr ([8 x i32], [8 x i32]* @GV, i64 0, i64 -1) to <8 x i32>*)
+  ret <8 x i32> %load
+}
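Note (not part of the patch): the following is a minimal standalone C++ sketch of the negative-offset handling that the ConstantFolding.cpp hunks above introduce. GVBytes and foldPartialLoad are invented names used only for illustration, and a little-endian data layout is assumed. It reproduces the <i32 0, i32 42, ..., i32 48> result that the load.ll test expects: bytes before the start of @GV fold to zero, and the in-bounds bytes come from the initializer.

// Illustration only, not part of the patch. Mirrors the patched
// FoldReinterpretLoadFromConstPtr byte logic with plain arrays instead of
// llvm::Constant; names and layout assumptions are noted above.
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Bytes of @GV = [i32 42, 43, ..., 49] on a little-endian target.
static std::vector<uint8_t> GVBytes() {
  std::vector<uint8_t> Bytes;
  for (uint32_t V = 42; V <= 49; ++V)
    for (int B = 0; B < 4; ++B)
      Bytes.push_back(uint8_t(V >> (8 * B)));
  return Bytes;
}

// Bytes before the start of the global read as zero; bytes inside the global
// come from its initializer. Callers are assumed to have already rejected
// loads that do not overlap the global at all (as the patched code does).
static std::vector<uint8_t> foldPartialLoad(int64_t Offset, unsigned BytesLoaded) {
  std::vector<uint8_t> Init = GVBytes();
  std::vector<uint8_t> RawBytes(BytesLoaded, 0);
  uint8_t *CurPtr = RawBytes.data();
  int64_t BytesLeft = BytesLoaded;
  if (Offset < 0) {      // Loading off the beginning of the global: skip the
    CurPtr += -Offset;   // leading zero-filled bytes and shrink the in-bounds
    BytesLeft += Offset; // portion of the read accordingly.
    Offset = 0;
  }
  std::memcpy(CurPtr, Init.data() + Offset,
              std::min<int64_t>(BytesLeft, int64_t(Init.size()) - Offset));
  return RawBytes;
}

int main() {
  // load <8 x i32> at element index -1 (byte offset -4), as in the load.ll test.
  std::vector<uint8_t> Bytes = foldPartialLoad(-4, 32);
  for (unsigned I = 0; I != 8; ++I) {
    uint32_t V;
    std::memcpy(&V, Bytes.data() + 4 * I, 4);
    std::cout << V << (I + 1 == 8 ? '\n' : ' '); // prints: 0 42 43 44 45 46 47 48
  }
}

The same shift of CurPtr and BytesLeft is what lets the patched code call ReadDataFromGlobal with a clamped, non-negative offset while keeping the out-of-bounds prefix zero-filled.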