diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -119,7 +119,7 @@ ValueTy = lvalue.getType(); ValueSizeInBits = C.getTypeSize(ValueTy); AtomicTy = ValueTy = CGF.getContext().getExtVectorType( - lvalue.getType(), cast( + lvalue.getType(), cast( lvalue.getExtVectorAddress().getElementType()) ->getNumElements()); AtomicSizeInBits = C.getTypeSize(AtomicTy); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4561,11 +4561,11 @@ getTarget().getTriple().getArch()); } -static llvm::VectorType *GetNeonType(CodeGenFunction *CGF, - NeonTypeFlags TypeFlags, - bool HasLegalHalfType = true, - bool V1Ty = false, - bool AllowBFloatArgsAndRet = true) { +static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF, + NeonTypeFlags TypeFlags, + bool HasLegalHalfType = true, + bool V1Ty = false, + bool AllowBFloatArgsAndRet = true) { int IsQuad = TypeFlags.isQuad(); switch (TypeFlags.getEltType()) { case NeonTypeFlags::Int8: @@ -5621,8 +5621,8 @@ const bool AllowBFloatArgsAndRet = getTargetHooks().getABIInfo().allowBFloatArgsAndRet(); - llvm::VectorType *VTy = GetNeonType(this, Type, HasLegalHalfType, false, - AllowBFloatArgsAndRet); + llvm::FixedVectorType *VTy = + GetNeonType(this, Type, HasLegalHalfType, false, AllowBFloatArgsAndRet); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -5663,8 +5663,8 @@ return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs"); return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs"); case NEON::BI__builtin_neon_vaddhn_v: { - llvm::VectorType *SrcTy = - llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); // %sum = add <4 x i32> %lhs, %rhs Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); @@ -5936,14 +5936,16 @@ return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); } case NEON::BI__builtin_neon_vmovl_v: { - llvm::Type *DTy =llvm::VectorType::getTruncatedElementVectorType(VTy); + llvm::FixedVectorType *DTy = + llvm::FixedVectorType::getTruncatedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], DTy); if (Usgn) return Builder.CreateZExt(Ops[0], Ty, "vmovl"); return Builder.CreateSExt(Ops[0], Ty, "vmovl"); } case NEON::BI__builtin_neon_vmovn_v: { - llvm::Type *QTy = llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *QTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], QTy); return Builder.CreateTrunc(Ops[0], Ty, "vmovn"); } @@ -5989,7 +5991,7 @@ case NEON::BI__builtin_neon_vqdmulh_lane_v: case NEON::BI__builtin_neon_vqrdmulhq_lane_v: case NEON::BI__builtin_neon_vqrdmulh_lane_v: { - auto *RTy = cast(Ty); + auto *RTy = cast(Ty); if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v || BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v) RTy = llvm::FixedVectorType::get(RTy->getElementType(), @@ -6038,7 +6040,8 @@ return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1], "vshl_n"); case NEON::BI__builtin_neon_vshll_n_v: { - llvm::Type *SrcTy = llvm::VectorType::getTruncatedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getTruncatedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); if (Usgn) Ops[0] = Builder.CreateZExt(Ops[0], VTy); @@ -6048,7 +6051,8 @@ return Builder.CreateShl(Ops[0], Ops[1], 
"vshll_n"); } case NEON::BI__builtin_neon_vshrn_n_v: { - llvm::Type *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false); if (Usgn) @@ -6097,8 +6101,8 @@ return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, ""); } case NEON::BI__builtin_neon_vsubhn_v: { - llvm::VectorType *SrcTy = - llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); // %sum = add <4 x i32> %lhs, %rhs Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); @@ -6310,7 +6314,7 @@ // Build a vector containing sequential number like (0, 1, 2, ..., 15) SmallVector Indices; - llvm::VectorType *TblTy = cast(Ops[0]->getType()); + auto *TblTy = cast(Ops[0]->getType()); for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) { Indices.push_back(2*i); Indices.push_back(2*i+1); @@ -7151,10 +7155,9 @@ bool usgn = Type.isUnsigned(); bool rightShift = false; - llvm::VectorType *VTy = GetNeonType(this, Type, - getTarget().hasLegalHalfType(), - false, - getTarget().hasBFloat16Type()); + llvm::FixedVectorType *VTy = + GetNeonType(this, Type, getTarget().hasLegalHalfType(), false, + getTarget().hasBFloat16Type()); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -7362,7 +7365,7 @@ // or odds, as desired). SmallVector Indices; unsigned InputElements = - cast(V->getType())->getNumElements(); + cast(V->getType())->getNumElements(); for (unsigned i = 0; i < InputElements; i += 2) Indices.push_back(i + Odd); return Builder.CreateShuffleVector(V, llvm::UndefValue::get(V->getType()), @@ -7375,7 +7378,7 @@ assert(V0->getType() == V1->getType() && "Can't zip different vector types"); SmallVector Indices; unsigned InputElements = - cast(V0->getType())->getNumElements(); + cast(V0->getType())->getNumElements(); for (unsigned i = 0; i < InputElements; i++) { Indices.push_back(i); Indices.push_back(i + InputElements); @@ -7571,7 +7574,7 @@ // Determine the type of this overloaded NEON intrinsic. 
NeonTypeFlags Type = Result->getZExtValue(); - llvm::VectorType *Ty = GetNeonType(&CGF, Type); + llvm::FixedVectorType *Ty = GetNeonType(&CGF, Type); if (!Ty) return nullptr; @@ -9773,7 +9776,7 @@ } } - llvm::VectorType *VTy = GetNeonType(this, Type); + llvm::FixedVectorType *VTy = GetNeonType(this, Type); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -9834,13 +9837,13 @@ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla"); } case NEON::BI__builtin_neon_vfma_laneq_v: { - llvm::VectorType *VTy = cast(Ty); + auto *VTy = cast(Ty); // v1f64 fma should be mapped to Neon scalar f64 fma if (VTy && VTy->getElementType() == DoubleTy) { Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy); - llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, true)); + llvm::FixedVectorType *VTy = + GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, true)); Ops[2] = Builder.CreateBitCast(Ops[2], VTy); Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract"); Value *Result; @@ -10208,8 +10211,8 @@ if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v) Quad = true; Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); - llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, Quad)); + llvm::FixedVectorType *VTy = + GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, Quad)); Ops[1] = Builder.CreateBitCast(Ops[1], VTy); Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract"); Value *Result = Builder.CreateFMul(Ops[0], Ops[1]); @@ -11081,7 +11084,8 @@ llvm::PointerType::getUnqual(Ops[1]->getType())); Value *MaskVec = getMaskVecValue( - CGF, Ops[2], cast(Ops[1]->getType())->getNumElements()); + CGF, Ops[2], + cast(Ops[1]->getType())->getNumElements()); return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Alignment, MaskVec); } @@ -11093,7 +11097,8 @@ llvm::PointerType::getUnqual(Ops[1]->getType())); Value *MaskVec = getMaskVecValue( - CGF, Ops[2], cast(Ops[1]->getType())->getNumElements()); + CGF, Ops[2], + cast(Ops[1]->getType())->getNumElements()); return CGF.Builder.CreateMaskedLoad(Ptr, Alignment, MaskVec, Ops[1]); } @@ -11107,7 +11112,8 @@ Value *Ptr = CGF.Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(PtrTy)); - Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements()); + Value *MaskVec = getMaskVecValue( + CGF, Ops[2], cast(ResultTy)->getNumElements()); llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload, ResultTy); @@ -11117,7 +11123,7 @@ static Value *EmitX86CompressExpand(CodeGenFunction &CGF, ArrayRef Ops, bool IsCompress) { - auto *ResultTy = cast(Ops[1]->getType()); + auto *ResultTy = cast(Ops[1]->getType()); Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements()); @@ -11129,7 +11135,7 @@ static Value *EmitX86CompressStore(CodeGenFunction &CGF, ArrayRef Ops) { - auto *ResultTy = cast(Ops[1]->getType()); + auto *ResultTy = cast(Ops[1]->getType()); llvm::Type *PtrTy = ResultTy->getElementType(); // Cast the pointer to element type. @@ -11165,7 +11171,7 @@ // Funnel shifts amounts are treated as modulo and types are all power-of-2 so // we only care about the lowest log2 bits anyway. 
if (Amt->getType() != Ty) { - unsigned NumElts = cast(Ty)->getNumElements(); + unsigned NumElts = cast(Ty)->getNumElements(); Amt = CGF.Builder.CreateIntCast(Amt, Ty->getScalarType(), false); Amt = CGF.Builder.CreateVectorSplat(NumElts, Amt); } @@ -11224,7 +11230,7 @@ return Op0; Mask = getMaskVecValue( - CGF, Mask, cast(Op0->getType())->getNumElements()); + CGF, Mask, cast(Op0->getType())->getNumElements()); return CGF.Builder.CreateSelect(Mask, Op0, Op1); } @@ -11271,7 +11277,7 @@ assert((Ops.size() == 2 || Ops.size() == 4) && "Unexpected number of arguments"); unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *Cmp; if (CC == 3) { @@ -11548,7 +11554,8 @@ static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op, llvm::Type *DstTy) { - unsigned NumberOfElements = cast(DstTy)->getNumElements(); + unsigned NumberOfElements = + cast(DstTy)->getNumElements(); Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements); return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2"); } @@ -11584,11 +11591,12 @@ return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]}); } - unsigned NumDstElts = cast(DstTy)->getNumElements(); + unsigned NumDstElts = cast(DstTy)->getNumElements(); Value *Src = Ops[0]; // Extract the subvector. - if (NumDstElts != cast(Src->getType())->getNumElements()) { + if (NumDstElts != + cast(Src->getType())->getNumElements()) { assert(NumDstElts == 4 && "Unexpected vector size"); Src = CGF.Builder.CreateShuffleVector(Src, UndefValue::get(Src->getType()), ArrayRef{0, 1, 2, 3}); @@ -11887,7 +11895,7 @@ case X86::BI__builtin_ia32_vec_ext_v8si: case X86::BI__builtin_ia32_vec_ext_v4di: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); uint64_t Index = cast(Ops[1])->getZExtValue(); Index &= NumElts - 1; // These builtins exist so we can ensure the index is an ICE and in range. @@ -11903,7 +11911,7 @@ case X86::BI__builtin_ia32_vec_set_v8si: case X86::BI__builtin_ia32_vec_set_v4di: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned Index = cast(Ops[2])->getZExtValue(); Index &= NumElts - 1; // These builtins exist so we can ensure the index is an ICE and in range. 
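Every X86 hunk above makes the same substitution: element counts are now read off llvm::FixedVectorType rather than the base vector type, which may be scalable. A minimal sketch of that pattern, assuming only the usual LLVM headers — this helper is illustrative and is not something the patch adds:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Casting.h"

// Only fixed-width vectors carry a compile-time element count, so cast to
// FixedVectorType explicitly; the cast asserts (in debug builds) if the type
// is scalable or not a vector at all.
static unsigned fixedElementCount(llvm::Type *Ty) {
  return llvm::cast<llvm::FixedVectorType>(Ty)->getNumElements();
}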
@@ -12329,9 +12337,9 @@ break; } - unsigned MinElts = - std::min(cast(Ops[0]->getType())->getNumElements(), - cast(Ops[2]->getType())->getNumElements()); + unsigned MinElts = std::min( + cast(Ops[0]->getType())->getNumElements(), + cast(Ops[2]->getType())->getNumElements()); Ops[3] = getMaskVecValue(*this, Ops[3], MinElts); Function *Intr = CGM.getIntrinsic(IID); return Builder.CreateCall(Intr, Ops); @@ -12438,9 +12446,9 @@ break; } - unsigned MinElts = - std::min(cast(Ops[2]->getType())->getNumElements(), - cast(Ops[3]->getType())->getNumElements()); + unsigned MinElts = std::min( + cast(Ops[2]->getType())->getNumElements(), + cast(Ops[3]->getType())->getNumElements()); Ops[1] = getMaskVecValue(*this, Ops[1], MinElts); Function *Intr = CGM.getIntrinsic(IID); return Builder.CreateCall(Intr, Ops); @@ -12462,10 +12470,10 @@ case X86::BI__builtin_ia32_extracti64x2_256_mask: case X86::BI__builtin_ia32_extractf64x2_512_mask: case X86::BI__builtin_ia32_extracti64x2_512_mask: { - auto *DstTy = cast(ConvertType(E->getType())); + auto *DstTy = cast(ConvertType(E->getType())); unsigned NumElts = DstTy->getNumElements(); unsigned SrcNumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned SubVectors = SrcNumElts / NumElts; unsigned Index = cast(Ops[1])->getZExtValue(); assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); @@ -12503,9 +12511,9 @@ case X86::BI__builtin_ia32_insertf64x2_512: case X86::BI__builtin_ia32_inserti64x2_512: { unsigned DstNumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned SrcNumElts = - cast(Ops[1]->getType())->getNumElements(); + cast(Ops[1]->getType())->getNumElements(); unsigned SubVectors = DstNumElts / SrcNumElts; unsigned Index = cast(Ops[2])->getZExtValue(); assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); @@ -12570,7 +12578,7 @@ case X86::BI__builtin_ia32_pblendd128: case X86::BI__builtin_ia32_pblendd256: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned Imm = cast(Ops[2])->getZExtValue(); int Indices[16]; @@ -12587,7 +12595,7 @@ case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: { uint32_t Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); // Splat the 8-bits of immediate 4 times to help the loop wrap around. @@ -12611,7 +12619,7 @@ case X86::BI__builtin_ia32_pshufhw256: case X86::BI__builtin_ia32_pshufhw512: { uint32_t Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); // Splat the 8-bits of immediate 4 times to help the loop wrap around. 
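The pshuflw/pshufhw lowerings above decode an 8-bit immediate into per-lane shuffle indices. A worked sketch of that decoding for a single 128-bit lane of i16 — illustrative only, assuming the standard pshuflw semantics where the low four half-words are permuted and the high four pass through:

#include <array>
#include <cstdint>

static std::array<int, 8> pshuflwIndices(uint8_t Imm) {
  std::array<int, 8> Indices;
  for (unsigned i = 0; i != 4; ++i)
    Indices[i] = (Imm >> (2 * i)) & 3; // two immediate bits select each low half-word
  for (unsigned i = 4; i != 8; ++i)
    Indices[i] = i;                    // high half-words stay in place
  return Indices;
}
// Example: Imm = 0x1B yields {3, 2, 1, 0, 4, 5, 6, 7}.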
@@ -12641,7 +12649,7 @@ case X86::BI__builtin_ia32_vpermilpd512: case X86::BI__builtin_ia32_vpermilps512: { uint32_t Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; @@ -12668,7 +12676,7 @@ case X86::BI__builtin_ia32_shufps256: case X86::BI__builtin_ia32_shufps512: { uint32_t Imm = cast(Ops[2])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; @@ -12696,7 +12704,7 @@ case X86::BI__builtin_ia32_permdi512: case X86::BI__builtin_ia32_permdf512: { unsigned Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); // These intrinsics operate on 256-bit lanes of four 64-bit elements. @@ -12715,7 +12723,7 @@ unsigned ShiftVal = cast(Ops[2])->getZExtValue() & 0xff; unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); assert(NumElts % 16 == 0); // If palignr is shifting the pair of vectors more than the size of two @@ -12753,7 +12761,7 @@ case X86::BI__builtin_ia32_alignq256: case X86::BI__builtin_ia32_alignq512: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned ShiftVal = cast(Ops[2])->getZExtValue() & 0xff; // Mask the shift amount to width of two vectors. @@ -12776,7 +12784,7 @@ case X86::BI__builtin_ia32_shuf_i32x4: case X86::BI__builtin_ia32_shuf_i64x2: { unsigned Imm = cast(Ops[2])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2; unsigned NumLaneElts = NumElts / NumLanes; @@ -12803,7 +12811,7 @@ case X86::BI__builtin_ia32_permti256: { unsigned Imm = cast(Ops[2])->getZExtValue(); unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); // This takes a very simple approach since there are two lanes and a // shuffle can have 2 inputs. So we reserve the first input for the first @@ -12841,7 +12849,7 @@ case X86::BI__builtin_ia32_pslldqi256_byteshift: case X86::BI__builtin_ia32_pslldqi512_byteshift: { unsigned ShiftVal = cast(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast(Ops[0]->getType()); + auto *ResultType = cast(Ops[0]->getType()); // Builtin type is vXi64 so multiply by 8 to get bytes. unsigned NumElts = ResultType->getNumElements() * 8; @@ -12871,7 +12879,7 @@ case X86::BI__builtin_ia32_psrldqi256_byteshift: case X86::BI__builtin_ia32_psrldqi512_byteshift: { unsigned ShiftVal = cast(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast(Ops[0]->getType()); + auto *ResultType = cast(Ops[0]->getType()); // Builtin type is vXi64 so multiply by 8 to get bytes. 
unsigned NumElts = ResultType->getNumElements() * 8; @@ -13518,7 +13526,7 @@ case X86::BI__builtin_ia32_fpclasspd256_mask: case X86::BI__builtin_ia32_fpclasspd512_mask: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *MaskIn = Ops[2]; Ops.erase(&Ops[2]); @@ -13556,7 +13564,7 @@ case X86::BI__builtin_ia32_vp2intersect_d_256: case X86::BI__builtin_ia32_vp2intersect_d_128: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Intrinsic::ID ID; switch (BuiltinID) { @@ -13615,7 +13623,7 @@ case X86::BI__builtin_ia32_vpshufbitqmb256_mask: case X86::BI__builtin_ia32_vpshufbitqmb512_mask: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *MaskIn = Ops[2]; Ops.erase(&Ops[2]); @@ -13762,7 +13770,7 @@ Function *Intr = CGM.getIntrinsic(IID); if (IsMaskFCmp) { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Ops[3] = getMaskVecValue(*this, Ops[3], NumElts); Value *Cmp = Builder.CreateCall(Intr, Ops); return EmitX86MaskedCompareResult(*this, Cmp, NumElts, nullptr); @@ -13777,7 +13785,7 @@ // We ignore SAE if strict FP is disabled. We only keep precise // exception behavior under strict FP. unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *Cmp; if (IsSignaling) Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]); @@ -13835,7 +13843,7 @@ case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: { Ops[2] = getMaskVecValue( *this, Ops[2], - cast(Ops[0]->getType())->getNumElements()); + cast(Ops[0]->getType())->getNumElements()); Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128; return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); } diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -1685,7 +1685,7 @@ if (Ty->isVectorType()) { const llvm::Type *EltTy = Addr.getElementType(); - const auto *VTy = cast(EltTy); + const auto *VTy = cast(EltTy); // Handle vectors of size 3 like size 4 for better performance. if (VTy->getNumElements() == 3) { @@ -1770,8 +1770,9 @@ auto *VectorTy = dyn_cast( cast(Addr.getPointer()->getType())->getElementType()); if (VectorTy && !IsVector) { - auto *ArrayTy = llvm::ArrayType::get(VectorTy->getElementType(), - VectorTy->getNumElements()); + auto *ArrayTy = llvm::ArrayType::get( + VectorTy->getElementType(), + cast(VectorTy)->getNumElements()); return Address(CGF.Builder.CreateElementBitCast(Addr, ArrayTy)); } @@ -1802,7 +1803,7 @@ llvm::Type *SrcTy = Value->getType(); auto *VecTy = dyn_cast(SrcTy); // Handle vec3 special. - if (VecTy && VecTy->getNumElements() == 3) { + if (VecTy && cast(VecTy)->getNumElements() == 3) { // Our source is a vec3, do a shuffle vector to make it a vec4. 
Value = Builder.CreateShuffleVector(Value, llvm::UndefValue::get(VecTy), ArrayRef{0, 1, 2, -1}, @@ -2217,7 +2218,7 @@ if (const VectorType *VTy = Dst.getType()->getAs()) { unsigned NumSrcElts = VTy->getNumElements(); unsigned NumDstElts = - cast(Vec->getType())->getNumElements(); + cast(Vec->getType())->getNumElements(); if (NumDstElts == NumSrcElts) { // Use shuffle vector is the src and destination are the same number of // elements and restore the vector mask since it is on the side it will be diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -1320,7 +1320,7 @@ "Splatted expr doesn't match with vector element type?"); // Splat the element across to all elements - unsigned NumElements = cast(DstTy)->getNumElements(); + unsigned NumElements = cast(DstTy)->getNumElements(); return Builder.CreateVectorSplat(NumElements, Src, "splat"); } @@ -1553,12 +1553,12 @@ Value *RHS = CGF.EmitScalarExpr(E->getExpr(1)); Value *Mask; - llvm::VectorType *LTy = cast(LHS->getType()); + auto *LTy = cast(LHS->getType()); unsigned LHSElts = LTy->getNumElements(); Mask = RHS; - llvm::VectorType *MTy = cast(Mask->getType()); + auto *MTy = cast(Mask->getType()); // Mask off the high bits of each shuffle index. Value *MaskBits = @@ -1763,7 +1763,7 @@ return Visit(E->getInit(0)); } - unsigned ResElts = VType->getNumElements(); + unsigned ResElts = cast(VType)->getNumElements(); // Loop over initializers collecting the Value for each, and remembering // whether the source was swizzle (ExtVectorElementExpr). This will allow @@ -1787,7 +1787,8 @@ if (isa(IE)) { llvm::ExtractElementInst *EI = cast(Init); - if (EI->getVectorOperandType()->getNumElements() == ResElts) { + if (cast(EI->getVectorOperandType()) + ->getNumElements() == ResElts) { llvm::ConstantInt *C = cast(EI->getIndexOperand()); Value *LHS = nullptr, *RHS = nullptr; if (CurIdx == 0) { @@ -1825,7 +1826,7 @@ continue; } - unsigned InitElts = VVT->getNumElements(); + unsigned InitElts = cast(VVT)->getNumElements(); // If the initializer is an ExtVecEltExpr (a swizzle), and the swizzle's // input is the same width as the vector being constructed, generate an @@ -1834,7 +1835,7 @@ if (isa(IE)) { llvm::ShuffleVectorInst *SVI = cast(Init); Value *SVOp = SVI->getOperand(0); - llvm::VectorType *OpTy = cast(SVOp->getType()); + auto *OpTy = cast(SVOp->getType()); if (OpTy->getNumElements() == ResElts) { for (unsigned j = 0; j != CurIdx; ++j) { @@ -2170,7 +2171,7 @@ llvm::Type *DstTy = ConvertType(DestTy); Value *Elt = Visit(const_cast(E)); // Splat the element across to all elements - unsigned NumElements = cast(DstTy)->getNumElements(); + unsigned NumElements = cast(DstTy)->getNumElements(); return Builder.CreateVectorSplat(NumElements, Elt, "splat"); } @@ -4331,7 +4332,7 @@ llvm::Value *RHS = Visit(rhsExpr); llvm::Type *condType = ConvertType(condExpr->getType()); - llvm::VectorType *vecTy = cast(condType); + auto *vecTy = cast(condType); unsigned numElem = vecTy->getNumElements(); llvm::Type *elemType = vecTy->getElementType(); @@ -4534,10 +4535,14 @@ llvm::Type *DstTy = ConvertType(E->getType()); llvm::Type *SrcTy = Src->getType(); - unsigned NumElementsSrc = isa(SrcTy) ? - cast(SrcTy)->getNumElements() : 0; - unsigned NumElementsDst = isa(DstTy) ? - cast(DstTy)->getNumElements() : 0; + unsigned NumElementsSrc = + isa(SrcTy) + ? cast(SrcTy)->getNumElements() + : 0; + unsigned NumElementsDst = + isa(DstTy) + ? 
cast(DstTy)->getNumElements() + : 0; // Going from vec3 to non-vec3 is a special case and requires a shuffle // vector to get a vec4, then a bitcast if the target type is different. diff --git a/clang/lib/CodeGen/SwiftCallingConv.cpp b/clang/lib/CodeGen/SwiftCallingConv.cpp --- a/clang/lib/CodeGen/SwiftCallingConv.cpp +++ b/clang/lib/CodeGen/SwiftCallingConv.cpp @@ -320,9 +320,12 @@ // If we have a vector type, split it. if (auto vecTy = dyn_cast_or_null(type)) { auto eltTy = vecTy->getElementType(); - CharUnits eltSize = (end - begin) / vecTy->getNumElements(); + CharUnits eltSize = + (end - begin) / cast(vecTy)->getNumElements(); assert(eltSize == getTypeStoreSize(CGM, eltTy)); - for (unsigned i = 0, e = vecTy->getNumElements(); i != e; ++i) { + for (unsigned i = 0, + e = cast(vecTy)->getNumElements(); + i != e; ++i) { addEntry(eltTy, begin, begin + eltSize); begin += eltSize; } @@ -674,8 +677,9 @@ bool swiftcall::isLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, llvm::VectorType *vectorTy) { - return isLegalVectorType(CGM, vectorSize, vectorTy->getElementType(), - vectorTy->getNumElements()); + return isLegalVectorType( + CGM, vectorSize, vectorTy->getElementType(), + cast(vectorTy)->getNumElements()); } bool swiftcall::isLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, @@ -688,7 +692,7 @@ std::pair swiftcall::splitLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, llvm::VectorType *vectorTy) { - auto numElts = vectorTy->getNumElements(); + auto numElts = cast(vectorTy)->getNumElements(); auto eltTy = vectorTy->getElementType(); // Try to split the vector type in half. @@ -710,7 +714,7 @@ } // Try to split the vector into legal subvectors. - auto numElts = origVectorTy->getNumElements(); + auto numElts = cast(origVectorTy)->getNumElements(); auto eltTy = origVectorTy->getElementType(); assert(numElts != 1); diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -342,6 +342,7 @@ raw_ostream &operator<<(raw_ostream &OS, const SampleRecord &Sample); class FunctionSamples; +class SampleProfileReaderItaniumRemapper; using BodySampleMap = std::map; // NOTE: Using a StringMap here makes parsed profiles consume around 17% more @@ -428,35 +429,15 @@ return &iter->second; } - /// Returns a pointer to FunctionSamples at the given callsite location \p Loc - /// with callee \p CalleeName. If no callsite can be found, relax the - /// restriction to return the FunctionSamples at callsite location \p Loc - /// with the maximum total sample count. - const FunctionSamples *findFunctionSamplesAt(const LineLocation &Loc, - StringRef CalleeName) const { - std::string CalleeGUID; - CalleeName = getRepInFormat(CalleeName, UseMD5, CalleeGUID); - - auto iter = CallsiteSamples.find(Loc); - if (iter == CallsiteSamples.end()) - return nullptr; - auto FS = iter->second.find(CalleeName); - if (FS != iter->second.end()) - return &FS->second; - // If we cannot find exact match of the callee name, return the FS with - // the max total count. Only do this when CalleeName is not provided, - // i.e., only for indirect calls. 
- if (!CalleeName.empty()) - return nullptr; - uint64_t MaxTotalSamples = 0; - const FunctionSamples *R = nullptr; - for (const auto &NameFS : iter->second) - if (NameFS.second.getTotalSamples() >= MaxTotalSamples) { - MaxTotalSamples = NameFS.second.getTotalSamples(); - R = &NameFS.second; - } - return R; - } + /// Returns a pointer to FunctionSamples at the given callsite location + /// \p Loc with callee \p CalleeName. If no callsite can be found, relax + /// the restriction to return the FunctionSamples at callsite location + /// \p Loc with the maximum total sample count. If \p Remapper is not + /// nullptr, use \p Remapper to find FunctionSamples with equivalent name + /// as \p CalleeName. + const FunctionSamples * + findFunctionSamplesAt(const LineLocation &Loc, StringRef CalleeName, + SampleProfileReaderItaniumRemapper *Remapper) const; bool empty() const { return TotalSamples == 0; } @@ -630,7 +611,11 @@ /// tree nodes in the profile. /// /// \returns the FunctionSamples pointer to the inlined instance. - const FunctionSamples *findFunctionSamples(const DILocation *DIL) const; + /// If \p Remapper is not nullptr, it will be used to find matching + /// FunctionSamples with not exactly the same but equivalent name. + const FunctionSamples *findFunctionSamples( + const DILocation *DIL, + SampleProfileReaderItaniumRemapper *Remapper = nullptr) const; static SampleProfileFormat Format; @@ -648,6 +633,10 @@ return UseMD5 ? std::stoull(Name.data()) : Function::getGUID(Name); } + // Find all the names in the current FunctionSamples including names in + // all the inline instances and names of call targets. + void findAllNames(DenseSet<StringRef> &NameSet) const; + private: /// Mangled name of the function. StringRef Name; diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -208,6 +208,7 @@ #ifndef LLVM_PROFILEDATA_SAMPLEPROFREADER_H #define LLVM_PROFILEDATA_SAMPLEPROFREADER_H +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -275,15 +276,18 @@ return Remappings->lookup(FunctionName); } - /// Return the samples collected for function \p F if remapper knows - /// it is present in SampleMap. - FunctionSamples *getSamplesFor(StringRef FunctionName); + /// Return the equivalent name in the profile for \p FunctionName if + /// it exists. + Optional<StringRef> lookUpNameInProfile(StringRef FunctionName); private: // The buffer holding the content read from remapping file. std::unique_ptr<MemoryBuffer> Buffer; std::unique_ptr<SymbolRemappingReader> Remappings; - DenseMap<SymbolRemappingReader::Key, FunctionSamples *> SampleMap; + // Map remapping key to the name in the profile. By looking up the + // key in the remapper, a given new name can be mapped to the + // canonical name using the NameMap. + DenseMap<SymbolRemappingReader::Key, StringRef> NameMap; // The Reader the remapper is servicing. SampleProfileReader &Reader; // Indicate whether remapping has been applied to the profile read @@ -370,15 +374,19 @@ /// Return the samples collected for function \p F.
virtual FunctionSamples *getSamplesFor(StringRef Fname) { - if (Remapper) { - if (auto FS = Remapper->getSamplesFor(Fname)) - return FS; - } std::string FGUID; Fname = getRepInFormat(Fname, useMD5(), FGUID); auto It = Profiles.find(Fname); if (It != Profiles.end()) return &It->second; + + if (Remapper) { + if (auto NameInProfile = Remapper->lookUpNameInProfile(Fname)) { + auto It = Profiles.find(*NameInProfile); + if (It != Profiles.end()) + return &It->second; + } + } return nullptr; } @@ -423,6 +431,8 @@ /// Return whether names in the profile are all MD5 numbers. virtual bool useMD5() { return false; } + SampleProfileReaderItaniumRemapper *getRemapper() { return Remapper.get(); } + protected: /// Map every function to its associated profile. /// diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4302,7 +4302,7 @@ auto *ValC = dyn_cast(Val); auto *IdxC = dyn_cast(Idx); if (VecC && ValC && IdxC) - return ConstantFoldInsertElementInstruction(VecC, ValC, IdxC); + return ConstantExpr::getInsertElement(VecC, ValC, IdxC); // For fixed-length vector, fold into undef if index is out of bounds. if (auto *CI = dyn_cast(Idx)) { @@ -4367,7 +4367,7 @@ auto *VecVTy = cast(Vec->getType()); if (auto *CVec = dyn_cast(Vec)) { if (auto *CIdx = dyn_cast(Idx)) - return ConstantFoldExtractElementInstruction(CVec, CIdx); + return ConstantExpr::getExtractElement(CVec, CIdx); // The index is not relevant if our vector is a splat. if (auto *Splat = CVec->getSplatValue()) @@ -4565,8 +4565,8 @@ // If all operands are constant, constant fold the shuffle. This // transformation depends on the value of the mask which is not known at // compile time for scalable vectors - if (!Scalable && Op0Const && Op1Const) - return ConstantFoldShuffleVectorInstruction(Op0Const, Op1Const, Mask); + if (Op0Const && Op1Const) + return ConstantExpr::getShuffleVector(Op0Const, Op1Const, Mask); // Canonicalization: if only one input vector is constant, it shall be the // second one. This transformation depends on the value of the mask which diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -466,8 +466,6 @@ internalizeModule(*MergedModule, mustPreserveGV); - MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1); - ScopeRestrictionsDone = true; } @@ -559,6 +557,9 @@ // Mark which symbols can not be internalized this->applyScopeRestrictions(); + // Write LTOPostLink flag for passes that require all the modules. + MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1); + // Instantiate the pass manager to organize the passes. 
legacy::PassManager passes; diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp --- a/llvm/lib/ProfileData/SampleProf.cpp +++ b/llvm/lib/ProfileData/SampleProf.cpp @@ -14,6 +14,7 @@ #include "llvm/ProfileData/SampleProf.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" @@ -174,8 +175,8 @@ 0xffff; } -const FunctionSamples * -FunctionSamples::findFunctionSamples(const DILocation *DIL) const { +const FunctionSamples *FunctionSamples::findFunctionSamples( + const DILocation *DIL, SampleProfileReaderItaniumRemapper *Remapper) const { assert(DIL); SmallVector, 10> S; @@ -190,11 +191,59 @@ return this; const FunctionSamples *FS = this; for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) { - FS = FS->findFunctionSamplesAt(S[i].first, S[i].second); + FS = FS->findFunctionSamplesAt(S[i].first, S[i].second, Remapper); } return FS; } +void FunctionSamples::findAllNames(DenseSet &NameSet) const { + NameSet.insert(Name); + for (const auto &BS : BodySamples) + for (const auto &TS : BS.second.getCallTargets()) + NameSet.insert(TS.getKey()); + + for (const auto &CS : CallsiteSamples) { + for (const auto &NameFS : CS.second) { + NameSet.insert(NameFS.first); + NameFS.second.findAllNames(NameSet); + } + } +} + +const FunctionSamples *FunctionSamples::findFunctionSamplesAt( + const LineLocation &Loc, StringRef CalleeName, + SampleProfileReaderItaniumRemapper *Remapper) const { + std::string CalleeGUID; + CalleeName = getRepInFormat(CalleeName, UseMD5, CalleeGUID); + + auto iter = CallsiteSamples.find(Loc); + if (iter == CallsiteSamples.end()) + return nullptr; + auto FS = iter->second.find(CalleeName); + if (FS != iter->second.end()) + return &FS->second; + if (Remapper) { + if (auto NameInProfile = Remapper->lookUpNameInProfile(CalleeName)) { + auto FS = iter->second.find(*NameInProfile); + if (FS != iter->second.end()) + return &FS->second; + } + } + // If we cannot find exact match of the callee name, return the FS with + // the max total count. Only do this when CalleeName is not provided, + // i.e., only for indirect calls. 
+ if (!CalleeName.empty()) + return nullptr; + uint64_t MaxTotalSamples = 0; + const FunctionSamples *R = nullptr; + for (const auto &NameFS : iter->second) + if (NameFS.second.getTotalSamples() >= MaxTotalSamples) { + MaxTotalSamples = NameFS.second.getTotalSamples(); + R = &NameFS.second; + } + return R; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void FunctionSamples::dump() const { print(dbgs(), 0); } #endif diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -1291,18 +1291,22 @@ } assert(Remappings && "should be initialized while creating remapper"); - for (auto &Sample : Reader.getProfiles()) - if (auto Key = Remappings->insert(Sample.first())) - SampleMap.insert({Key, &Sample.second}); + for (auto &Sample : Reader.getProfiles()) { + DenseSet NamesInSample; + Sample.second.findAllNames(NamesInSample); + for (auto &Name : NamesInSample) + if (auto Key = Remappings->insert(Name)) + NameMap.insert({Key, Name}); + } RemappingApplied = true; } -FunctionSamples * -SampleProfileReaderItaniumRemapper::getSamplesFor(StringRef Fname) { +Optional +SampleProfileReaderItaniumRemapper::lookUpNameInProfile(StringRef Fname) { if (auto Key = Remappings->lookup(Fname)) - return SampleMap.lookup(Key); - return nullptr; + return NameMap.lookup(Key); + return None; } /// Prepare a memory buffer for the contents of \p Filename. diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -5487,7 +5487,7 @@ def : InstAlias(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>; def : InstAlias(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + (!cast(NAME # _IMM) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt), (!cast(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1304,9 +1304,9 @@ } else if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { unsigned OffsetValue0 = CAddr->getZExtValue() / Align; unsigned OffsetValue1 = OffsetValue0 + 1; - assert(Align * OffsetValue0 == CAddr->getZExtValue()); + bool OffsetIsAligned = Align * OffsetValue0 == CAddr->getZExtValue(); - if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1)) { + if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1) && OffsetIsAligned) { SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -469,8 +469,7 @@ SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerHvxStore(SDValue Op, SelectionDAG &DAG) const; - SDValue HvxVecPredBitcastComputation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const; SDValue 
SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const; SDValue SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -97,6 +97,8 @@ setOperationAction(ISD::CTTZ, T, Custom); setOperationAction(ISD::LOAD, T, Custom); + setOperationAction(ISD::MLOAD, T, Custom); + setOperationAction(ISD::MSTORE, T, Custom); setOperationAction(ISD::MUL, T, Custom); setOperationAction(ISD::MULHS, T, Custom); setOperationAction(ISD::MULHU, T, Custom); @@ -150,6 +152,8 @@ setOperationAction(ISD::LOAD, T, Custom); setOperationAction(ISD::STORE, T, Custom); + setOperationAction(ISD::MLOAD, T, Custom); + setOperationAction(ISD::MSTORE, T, Custom); setOperationAction(ISD::CTLZ, T, Custom); setOperationAction(ISD::CTTZ, T, Custom); setOperationAction(ISD::CTPOP, T, Custom); @@ -188,6 +192,9 @@ setOperationAction(ISD::AND, BoolW, Custom); setOperationAction(ISD::OR, BoolW, Custom); setOperationAction(ISD::XOR, BoolW, Custom); + // Masked load/store takes a mask that may need splitting. + setOperationAction(ISD::MLOAD, BoolW, Custom); + setOperationAction(ISD::MSTORE, BoolW, Custom); } for (MVT T : LegalV) { @@ -1593,7 +1600,7 @@ SDValue HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const { - const SDLoc &dl(Op); + const SDLoc &dl(Op); MVT ResTy = ty(Op); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); @@ -1613,6 +1620,75 @@ return Op; } +SDValue +HexagonTargetLowering::LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const { + const SDLoc &dl(Op); + unsigned HwLen = Subtarget.getVectorLength(); + auto *MaskN = cast(Op.getNode()); + SDValue Mask = MaskN->getMask(); + SDValue Chain = MaskN->getChain(); + SDValue Base = MaskN->getBasePtr(); + auto *MemOp = MaskN->getMemOperand(); + + unsigned Opc = Op->getOpcode(); + assert(Opc == ISD::MLOAD || Opc == ISD::MSTORE); + + if (Opc == ISD::MLOAD) { + MVT ValTy = ty(Op); + SDValue Load = DAG.getLoad(ValTy, dl, Chain, Base, MaskN->getMemOperand()); + SDValue Thru = cast(MaskN)->getPassThru(); + if (isUndef(Thru)) + return Load; + SDValue VSel = DAG.getNode(ISD::VSELECT, dl, ValTy, Mask, Load, Thru); + return DAG.getMergeValues({VSel, Load.getValue(1)}, dl); + } + + // MSTORE + // HVX only has aligned masked stores. + + // TODO: Fold negations of the mask into the store. + unsigned StoreOpc = Hexagon::V6_vS32b_qpred_ai; + SDValue Value = cast(MaskN)->getValue(); + SDValue Offset0 = DAG.getTargetConstant(0, dl, ty(Base)); + + if (MaskN->getAlign().value() % HwLen == 0) { + SDValue Store = getInstr(StoreOpc, dl, MVT::Other, + {Mask, Base, Offset0, Value, Chain}, DAG); + DAG.setNodeMemRefs(cast(Store.getNode()), {MemOp}); + return Store; + } + + // Unaligned case. + auto StoreAlign = [&](SDValue V, SDValue A) { + SDValue Z = getZero(dl, ty(V), DAG); + // TODO: use funnel shifts? + // vlalign(Vu,Vv,Rt) rotates the pair Vu:Vv left by Rt and takes the + // upper half. 
+ SDValue LoV = getInstr(Hexagon::V6_vlalignb, dl, ty(V), {V, Z, A}, DAG); + SDValue HiV = getInstr(Hexagon::V6_vlalignb, dl, ty(V), {Z, V, A}, DAG); + return std::make_pair(LoV, HiV); + }; + + MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen); + MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen); + SDValue MaskV = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, Mask); + VectorPair Tmp = StoreAlign(MaskV, Base); + VectorPair MaskU = {DAG.getNode(HexagonISD::V2Q, dl, BoolTy, Tmp.first), + DAG.getNode(HexagonISD::V2Q, dl, BoolTy, Tmp.second)}; + VectorPair ValueU = StoreAlign(Value, Base); + + SDValue Offset1 = DAG.getTargetConstant(HwLen, dl, MVT::i32); + SDValue StoreLo = + getInstr(StoreOpc, dl, MVT::Other, + {MaskU.first, Base, Offset0, ValueU.first, Chain}, DAG); + SDValue StoreHi = + getInstr(StoreOpc, dl, MVT::Other, + {MaskU.second, Base, Offset1, ValueU.second, Chain}, DAG); + DAG.setNodeMemRefs(cast(StoreLo.getNode()), {MemOp}); + DAG.setNodeMemRefs(cast(StoreHi.getNode()), {MemOp}); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, {StoreLo, StoreHi}); +} + SDValue HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const { assert(!Op.isMachineOpcode()); @@ -1648,45 +1724,81 @@ SDValue HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const { - LSBaseSDNode *BN = cast(Op.getNode()); - assert(BN->isUnindexed()); - MVT MemTy = BN->getMemoryVT().getSimpleVT(); + auto *MemN = cast(Op.getNode()); + + MVT MemTy = MemN->getMemoryVT().getSimpleVT(); if (!isHvxPairTy(MemTy)) return Op; const SDLoc &dl(Op); unsigned HwLen = Subtarget.getVectorLength(); MVT SingleTy = typeSplit(MemTy).first; - SDValue Chain = BN->getChain(); - SDValue Base0 = BN->getBasePtr(); + SDValue Chain = MemN->getChain(); + SDValue Base0 = MemN->getBasePtr(); SDValue Base1 = DAG.getMemBasePlusOffset(Base0, TypeSize::Fixed(HwLen), dl); MachineMemOperand *MOp0 = nullptr, *MOp1 = nullptr; - if (MachineMemOperand *MMO = BN->getMemOperand()) { + if (MachineMemOperand *MMO = MemN->getMemOperand()) { MachineFunction &MF = DAG.getMachineFunction(); MOp0 = MF.getMachineMemOperand(MMO, 0, HwLen); MOp1 = MF.getMachineMemOperand(MMO, HwLen, HwLen); } - unsigned MemOpc = BN->getOpcode(); - SDValue NewOp; + unsigned MemOpc = MemN->getOpcode(); if (MemOpc == ISD::LOAD) { + assert(cast(Op)->isUnindexed()); SDValue Load0 = DAG.getLoad(SingleTy, dl, Chain, Base0, MOp0); SDValue Load1 = DAG.getLoad(SingleTy, dl, Chain, Base1, MOp1); - NewOp = DAG.getMergeValues( - { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, Load0, Load1), - DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - Load0.getValue(1), Load1.getValue(1)) }, dl); - } else { - assert(MemOpc == ISD::STORE); + return DAG.getMergeValues( + { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, Load0, Load1), + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Load0.getValue(1), Load1.getValue(1)) }, dl); + } + if (MemOpc == ISD::STORE) { + assert(cast(Op)->isUnindexed()); VectorPair Vals = opSplit(cast(Op)->getValue(), dl, DAG); SDValue Store0 = DAG.getStore(Chain, dl, Vals.first, Base0, MOp0); SDValue Store1 = DAG.getStore(Chain, dl, Vals.second, Base1, MOp1); - NewOp = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store0, Store1); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store0, Store1); + } + + assert(MemOpc == ISD::MLOAD || MemOpc == ISD::MSTORE); + + auto MaskN = cast(Op); + assert(MaskN->isUnindexed()); + VectorPair Masks = opSplit(MaskN->getMask(), dl, DAG); + SDValue Offset = DAG.getUNDEF(MVT::i32); + + if (MemOpc == ISD::MLOAD) { + VectorPair Thru = 
+ opSplit(cast(Op)->getPassThru(), dl, DAG); + SDValue MLoad0 = + DAG.getMaskedLoad(SingleTy, dl, Chain, Base0, Offset, Masks.first, + Thru.first, SingleTy, MOp0, ISD::UNINDEXED, + ISD::NON_EXTLOAD, false); + SDValue MLoad1 = + DAG.getMaskedLoad(SingleTy, dl, Chain, Base1, Offset, Masks.second, + Thru.second, SingleTy, MOp1, ISD::UNINDEXED, + ISD::NON_EXTLOAD, false); + return DAG.getMergeValues( + { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, MLoad0, MLoad1), + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + MLoad0.getValue(1), MLoad1.getValue(1)) }, dl); + } + if (MemOpc == ISD::MSTORE) { + VectorPair Vals = opSplit(cast(Op)->getValue(), dl, DAG); + SDValue MStore0 = DAG.getMaskedStore(Chain, dl, Vals.first, Base0, Offset, + Masks.first, SingleTy, MOp0, + ISD::UNINDEXED, false, false); + SDValue MStore1 = DAG.getMaskedStore(Chain, dl, Vals.second, Base1, Offset, + Masks.second, SingleTy, MOp1, + ISD::UNINDEXED, false, false); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MStore0, MStore1); } - return NewOp; + std::string Name = "Unexpected operation: " + Op->getOperationName(&DAG); + llvm_unreachable(Name.c_str()); } SDValue @@ -1749,6 +1861,8 @@ case ISD::SETCC: case ISD::INTRINSIC_VOID: return Op; case ISD::INTRINSIC_WO_CHAIN: return LowerHvxIntrinsic(Op, DAG); + case ISD::MLOAD: + case ISD::MSTORE: return LowerHvxMaskedOp(Op, DAG); // Unaligned loads will be handled by the default lowering. case ISD::LOAD: return SDValue(); } @@ -1761,6 +1875,25 @@ void HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + unsigned Opc = N->getOpcode(); + SDValue Op(N, 0); + + switch (Opc) { + case ISD::MLOAD: + if (isHvxPairTy(ty(Op))) { + SDValue S = SplitHvxMemOp(Op, DAG); + assert(S->getOpcode() == ISD::MERGE_VALUES); + Results.push_back(S.getOperand(0)); + Results.push_back(S.getOperand(1)); + } + break; + case ISD::MSTORE: + if (isHvxPairTy(ty(Op->getOperand(1)))) { // Stored value + SDValue S = SplitHvxMemOp(Op, DAG); + Results.push_back(S); + } + break; + } } void @@ -1783,6 +1916,8 @@ SDValue HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); const SDLoc &dl(N); SDValue Op(N, 0); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -2721,6 +2721,8 @@ case Hexagon::PS_vloadrw_nt_ai: case Hexagon::V6_vL32b_ai: case Hexagon::V6_vS32b_ai: + case Hexagon::V6_vS32b_qpred_ai: + case Hexagon::V6_vS32b_nqpred_ai: case Hexagon::V6_vL32b_nt_ai: case Hexagon::V6_vS32b_nt_ai: case Hexagon::V6_vL32Ub_ai: diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -364,6 +364,14 @@ (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 16)), (A2_tfrsi 16))>; } + // Take a pair of vectors Vt:Vs and shift them towards LSB by (Rt & HwLen). 
+ def: Pat<(VecI8 (valign HVI8:$Vt, HVI8:$Vs, I32:$Rt)), + (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>; + def: Pat<(VecI16 (valign HVI16:$Vt, HVI16:$Vs, I32:$Rt)), + (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>; + def: Pat<(VecI32 (valign HVI32:$Vt, HVI32:$Vs, I32:$Rt)), + (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>; + def: Pat<(HexagonVASL HVI8:$Vs, I32:$Rt), (V6_vpackeb (V6_vaslh (HiVec (VZxtb HvxVR:$Vs)), I32:$Rt), (V6_vaslh (LoVec (VZxtb HvxVR:$Vs)), I32:$Rt))>; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -155,6 +155,9 @@ return 1; } + bool isLegalMaskedStore(Type *DataType, Align Alignment); + bool isLegalMaskedLoad(Type *DataType, Align Alignment); + /// @} int getUserCost(const User *U, ArrayRef Operands, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -35,6 +35,9 @@ cl::init(true), cl::Hidden, cl::desc("Control lookup table emission on Hexagon target")); +static cl::opt HexagonMaskedVMem("hexagon-masked-vmem", cl::init(true), + cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); + // Constant "cost factor" to make floating point operations more expensive // in terms of vectorization cost. This isn't the best way, but it should // do. Ultimately, the cost should use cycles. @@ -45,8 +48,7 @@ } bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const { - assert(VecTy->isVectorTy()); - if (isa(VecTy)) + if (!VecTy->isVectorTy() || isa(VecTy)) return false; // Avoid types like <2 x i32*>. if (!cast(VecTy)->getElementType()->isIntegerTy()) @@ -308,6 +310,14 @@ return 1; } +bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/) { + return HexagonMaskedVMem && isTypeForHVX(DataType); +} + +bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/) { + return HexagonMaskedVMem && isTypeForHVX(DataType); +} + /// --- Vector TTI end --- unsigned HexagonTTIImpl::getPrefetchDistance() const { diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -840,7 +840,7 @@ return FS->findFunctionSamplesAt(LineLocation(FunctionSamples::getOffset(DIL), DIL->getBaseDiscriminator()), - CalleeName); + CalleeName, Reader->getRemapper()); } /// Returns a vector of FunctionSamples that are the indirect call targets @@ -903,7 +903,7 @@ auto it = DILocation2SampleMap.try_emplace(DIL,nullptr); if (it.second) - it.first->second = Samples->findFunctionSamples(DIL); + it.first->second = Samples->findFunctionSamples(DIL, Reader->getRemapper()); return it.first->second; } @@ -1050,24 +1050,23 @@ PSI->getOrCompHotCountThreshold()); continue; } - auto CalleeFunctionName = FS->getFuncName(); + if (!callsiteIsHot(FS, PSI)) + continue; + + const char *Reason = "Callee function not available"; + // R->getValue() != &F is to prevent promoting a recursive call. // If it is a recursive call, we do not inline it as it could bloat // the code exponentially. There is way to better handle this, e.g. // clone the caller first, and inline the cloned caller if it is // recursive. 
As llvm does not inline recursive calls, we will // simply ignore it instead of handling it explicitly. - if (CalleeFunctionName == F.getName()) - continue; - - if (!callsiteIsHot(FS, PSI)) - continue; - - const char *Reason = "Callee function not available"; + auto CalleeFunctionName = FS->getFuncName(); auto R = SymbolMap.find(CalleeFunctionName); if (R != SymbolMap.end() && R->getValue() && !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() && R->getValue()->hasFnAttribute("use-sample-profile") && + R->getValue() != &F && isLegalToPromote(*I, R->getValue(), &Reason)) { uint64_t C = FS->getEntrySamples(); auto &DI = @@ -1854,7 +1853,6 @@ FunctionAnalysisManager *FAM) { auto &Ctx = M.getContext(); - std::unique_ptr RemapReader; auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx, RemappingFilename); if (std::error_code EC = ReaderOrErr.getError()) { @@ -1910,6 +1908,7 @@ for (const auto &I : Reader->getProfiles()) TotalCollectedSamples += I.second.getTotalSamples(); + auto Remapper = Reader->getRemapper(); // Populate the symbol map. for (const auto &N_F : M.getValueSymbolTable()) { StringRef OrigName = N_F.getKey(); @@ -1927,6 +1926,15 @@ // to nullptr to avoid confusion. if (!r.second) r.first->second = nullptr; + OrigName = NewName; + } + // Insert the remapped names into SymbolMap. + if (Remapper) { + if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) { + if (*MapName == OrigName) + continue; + SymbolMap.insert(std::make_pair(*MapName, F)); + } } } diff --git a/llvm/test/Analysis/ConstantFolding/vscale-shufflevector.ll b/llvm/test/Analysis/ConstantFolding/vscale-shufflevector.ll --- a/llvm/test/Analysis/ConstantFolding/vscale-shufflevector.ll +++ b/llvm/test/Analysis/ConstantFolding/vscale-shufflevector.ll @@ -15,9 +15,7 @@ ; the compiler. It happens to be the case that this will be the result. ; CHECK-LABEL: define @vscale_version() -; CHECK-NEXT: %splatter = insertelement undef, i1 true, i32 0 -; CHECK-NEXT: %foo = shufflevector %splatter, undef, zeroinitializer -; CHECK-NEXT: ret %foo +; CHECK-NEXT: ret shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer) define @vscale_version() { %splatter = insertelement undef, i1 true, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,-unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,+unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s ; FIXME: We don't get cases where the address was an SGPR because we ; get a copy to the address register for each one. 
@@ -317,7 +318,9 @@ ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-NOT: ds_read2_b32 +; CI-COUNT-4: ds_read_u8 +; GFX9-ALIGNED-4: ds_read_u8 +; GFX9-UNALIGNED-4: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}} ; GCN: s_endpgm define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -336,7 +339,9 @@ ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-NOT: ds_read2_b32 +; CI-COUNT-2: ds_read_u16 +; GFX9-ALIGNED-2: ds_read_u16 +; GFX9-UNALIGNED-4: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}} ; GCN: s_endpgm define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -655,6 +660,22 @@ ret <2 x float> %r1 } +@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 + +; GCN-LABEL: {{^}}read2_v2i32_align1_odd_offset: +; CI-COUNT-8: ds_read_u8 + +; GFX9-ALIGNED-COUNT-8: ds_read_u8 + +; GFX9-UNALIGNED: v_mov_b32_e32 [[BASE_ADDR:v[0-9]+]], 0x41{{$}} +; GFX9-UNALIGNED: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_ADDR]] offset1:1{{$}} +define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* %out) { +entry: + %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 + store <2 x i32> %load, <2 x i32> addrspace(1)* %out + ret void +} + declare void @void_func_void() #3 declare i32 @llvm.amdgcn.workgroup.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,-unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,+unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 @@ -523,6 +524,21 @@ ret void } +@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 + +; GCN-LABEL: {{^}}write2_v2i32_align1_odd_offset: +; CI-COUNT-8: ds_write_b8 + +; GFX9-ALIGNED-COUNT-8: ds_write_b8 + +; GFX9-UNALIGNED: v_mov_b32_e32 [[BASE_ADDR:v[0-9]+]], 0x41{{$}} +; GFX9-UNALIGNED: ds_write2_b32 [[BASE_ADDR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} +define amdgpu_kernel void @write2_v2i32_align1_odd_offset() { +entry: + store <2 x i32> , <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 + ret void +} + declare i32 @llvm.amdgcn.workgroup.id.x() #1 declare i32 
@llvm.amdgcn.workgroup.id.y() #1 declare i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll b/llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll @@ -0,0 +1,35 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; CHECK-LABEL: f0: +; CHECK: vmemu +; CHECK: vmux +define <128 x i8> @f0(<128 x i8>* %a0, i32 %a1, i32 %a2) #0 { + %q0 = call <128 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32 %a2) + %v0 = call <32 x i32> @llvm.hexagon.V6.lvsplatb.128B(i32 %a1) + %v1 = bitcast <32 x i32> %v0 to <128 x i8> + %v2 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %a0, i32 4, <128 x i1> %q0, <128 x i8> %v1) + ret <128 x i8> %v2 +} + +; CHECK-LABEL: f1: +; CHECK: vlalign +; CHECK: if (q{{.}}) vmem{{.*}} = v +define void @f1(<128 x i8>* %a0, i32 %a1, i32 %a2) #0 { + %q0 = call <128 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32 %a2) + %v0 = call <32 x i32> @llvm.hexagon.V6.lvsplatb.128B(i32 %a1) + %v1 = bitcast <32 x i32> %v0 to <128 x i8> + call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> %v1, <128 x i8>* %a0, i32 4, <128 x i1> %q0) + ret void +} + +declare <128 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32) #1 +declare <32 x i32> @llvm.hexagon.V6.lvsplatb.128B(i32) #1 +declare <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>*, i32 immarg, <128 x i1>, <128 x i8>) #2 +declare void @llvm.masked.store.v128i8.p0v128i8(<128 x i8>, <128 x i8>*, i32 immarg, <128 x i1>) #2 + +attributes #0 = { "target-cpu"="hexagonv65" "target-features"="+hvx,+hvx-length128b" } +attributes #1 = { nounwind readnone } +attributes #2 = { argmemonly nounwind readonly willreturn } +attributes #3 = { argmemonly nounwind willreturn } + + diff --git a/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll b/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll --- a/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll +++ b/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-instsimplify=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-instsimplify=0 -hexagon-masked-vmem=0 < %s | FileCheck %s ; Test that LLVM does not assert and bitcast v64i1 to i64 is lowered ; without crashing. diff --git a/llvm/test/CodeGen/Hexagon/store-vector-pred.ll b/llvm/test/CodeGen/Hexagon/store-vector-pred.ll --- a/llvm/test/CodeGen/Hexagon/store-vector-pred.ll +++ b/llvm/test/CodeGen/Hexagon/store-vector-pred.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-instsimplify=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-instsimplify=0 -hexagon-masked-vmem=0 < %s | FileCheck %s ; This test checks that store a vector predicate of type v128i1 is lowered ; without crashing. 
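The ds_read2.ll/ds_write2.ll updates above split the GFX9 runs on the unaligned-access-mode target feature: with it disabled, an align-1 LDS access is expanded into ds_read_u8/ds_write_b8, while with it enabled the load-store optimizer may still pair the two dwords into ds_read2_b32/ds_write2_b32. Roughly, the gating decision looks like this (an illustrative predicate, not the SILoadStoreOptimizer code):

#include <cstdint>

// May a 2 x 32-bit LDS access with this byte alignment be emitted as a single
// ds_read2_b32 / ds_write2_b32? (Sketch; the real hook also consults the
// subtarget generation and address space.)
bool mayFormDS2(uint64_t AlignInBytes, bool HasUnalignedAccessMode) {
  if (AlignInBytes >= 4)           // each dword element naturally aligned
    return true;
  return HasUnalignedAccessMode;   // under-aligned: only if the feature is on
}

The new Hexagon test then covers llvm.masked.load/llvm.masked.store lowering on HVX (vmemu plus vmux for the load, a predicated vmem for the store), which the other two Hexagon tests now disable via -hexagon-masked-vmem=0. The element-wise semantics being lowered are simply:

// Scalar model of llvm.masked.load / llvm.masked.store for <128 x i8>:
// masked-off lanes take the pass-through value and are never stored.
void maskedLoadV128i8(const uint8_t *Ptr, const bool *Mask,
                      const uint8_t *PassThru, uint8_t *Out) {
  for (int I = 0; I < 128; ++I)
    Out[I] = Mask[I] ? Ptr[I] : PassThru[I];
}

void maskedStoreV128i8(uint8_t *Ptr, const bool *Mask, const uint8_t *Val) {
  for (int I = 0; I < 128; ++I)
    if (Mask[I])
      Ptr[I] = Val[I];
}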
diff --git a/llvm/test/LTO/ARM/lto-linking-metadata.ll b/llvm/test/LTO/ARM/lto-linking-metadata.ll --- a/llvm/test/LTO/ARM/lto-linking-metadata.ll +++ b/llvm/test/LTO/ARM/lto-linking-metadata.ll @@ -1,7 +1,8 @@ ; RUN: opt %s -o %t1.bc -; RUN: llvm-lto %t1.bc -o %t1.save.opt -save-merged-module -O1 --exported-symbol=foo +; RUN: llvm-lto %t1.bc -o %t1.save.opt -save-linked-module -save-merged-module -O1 --exported-symbol=foo ; RUN: llvm-dis < %t1.save.opt.merged.bc | FileCheck %s +; RUN: llvm-dis < %t1.save.opt.linked.bc | FileCheck %s --check-prefix=CHECK-LINKED ; RUN: llvm-lto2 run %t1.bc -o %t.out.o -save-temps \ ; RUN: -r=%t1.bc,foo,pxl @@ -17,3 +18,6 @@ ; CHECK: !llvm.module.flags = !{[[MD_NUM:![0-9]+]]} ; CHECK: [[MD_NUM]] = !{i32 1, !"LTOPostLink", i32 1} + +; CHECK-LINKED: @foo +; CHECK-LINKED-NOT: LTOPostLink diff --git a/llvm/test/MC/AArch64/SVE/st1b.s b/llvm/test/MC/AArch64/SVE/st1b.s --- a/llvm/test/MC/AArch64/SVE/st1b.s +++ b/llvm/test/MC/AArch64/SVE/st1b.s @@ -168,3 +168,27 @@ // CHECK-ENCODING: [0xff,0xbf,0x5f,0xe4] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf 5f e4 + +st1b { z0.s }, p7, [z0.s, #0] +// CHECK-INST: st1b { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e4 + +st1b { z0.s }, p7, [z0.s] +// CHECK-INST: st1b { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e4 + +st1b { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1b { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e4 + +st1b { z0.d }, p7, [z0.d] +// CHECK-INST: st1b { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e4 diff --git a/llvm/test/MC/AArch64/SVE/st1d.s b/llvm/test/MC/AArch64/SVE/st1d.s --- a/llvm/test/MC/AArch64/SVE/st1d.s +++ b/llvm/test/MC/AArch64/SVE/st1d.s @@ -78,3 +78,15 @@ // CHECK-ENCODING: [0xff,0xbf,0xdf,0xe5] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf df e5 + +st1d { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1d { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e5 + +st1d { z0.d }, p7, [z0.d] +// CHECK-INST: st1d { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e5 diff --git a/llvm/test/MC/AArch64/SVE/st1h.s b/llvm/test/MC/AArch64/SVE/st1h.s --- a/llvm/test/MC/AArch64/SVE/st1h.s +++ b/llvm/test/MC/AArch64/SVE/st1h.s @@ -168,3 +168,27 @@ // CHECK-ENCODING: [0xff,0xbf,0xdf,0xe4] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf df e4 + +st1h { z0.s }, p7, [z0.s, #0] +// CHECK-INST: st1h { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0xe0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc e0 e4 + +st1h { z0.s }, p7, [z0.s] +// CHECK-INST: st1h { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0xe0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc e0 e4 + +st1h { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1h { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e4 + +st1h { z0.d }, p7, [z0.d] +// CHECK-INST: st1h { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe4] +// CHECK-ERROR: instruction requires: sve 
+// CHECK-UNKNOWN: 00 bc c0 e4 diff --git a/llvm/test/MC/AArch64/SVE/st1w.s b/llvm/test/MC/AArch64/SVE/st1w.s --- a/llvm/test/MC/AArch64/SVE/st1w.s +++ b/llvm/test/MC/AArch64/SVE/st1w.s @@ -138,3 +138,27 @@ // CHECK-ENCODING: [0xff,0xbf,0x5f,0xe5] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf 5f e5 + +st1w { z0.s }, p7, [z0.s, #0] +// CHECK-INST: st1w { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e5 + +st1w { z0.s }, p7, [z0.s] +// CHECK-INST: st1w { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e5 + +st1w { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1w { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e5 + +st1w { z0.d }, p7, [z0.d] +// CHECK-INST: st1w { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e5 diff --git a/llvm/test/Transforms/InstSimplify/vscale.ll b/llvm/test/Transforms/InstSimplify/vscale.ll --- a/llvm/test/Transforms/InstSimplify/vscale.ll +++ b/llvm/test/Transforms/InstSimplify/vscale.ll @@ -51,6 +51,23 @@ ret %r } +define @insertelement_inline_to_ret() { +; CHECK-LABEL: @insertelement_inline_to_ret( +; CHECK-NEXT: ret insertelement ( undef, i32 1, i32 0) +; + %i = insertelement undef, i32 1, i32 0 + ret %i +} + +define @insertelement_shufflevector_inline_to_ret() { +; CHECK-LABEL: @insertelement_shufflevector_inline_to_ret( +; CHECK-NEXT: ret shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) +; + %i = insertelement undef, i32 1, i32 0 + %i2 = shufflevector %i, undef, zeroinitializer + ret %i2 +} + ; extractelement define i32 @extractelement_idx_undef( %a) { @@ -120,6 +137,16 @@ ret %cmp } +define @bitcast() { +; CHECK-LABEL: @bitcast( +; CHECK-NEXT: ret bitcast ( shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) to ) +; + %i1 = insertelement undef, i32 1, i32 0 + %i2 = shufflevector %i1, undef, zeroinitializer + %i3 = bitcast %i2 to + ret %i3 +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Memory Access and Addressing Operations ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/llvm/test/Transforms/SampleProfile/Inputs/remap-2.prof b/llvm/test/Transforms/SampleProfile/Inputs/remap-2.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/remap-2.prof @@ -0,0 +1,16 @@ +test:15680:2500 + 1: 100 + 4: 100 + 5: 3000 xoo:1000 + 5: _ZN3foo3barERKN1N1XINS_4quuxEEE:2000 + 1: 2000 + 6: _ZN1N1XE:2500 + 1: 2500 + +_ZN1N1X1YE:15680:2500 + 1: 100 + 4: 100 + 5: 3000 xoo:1000 + 5: _ZN1N1X1YE:2000 + 1: 2000 + diff --git a/llvm/test/Transforms/SampleProfile/remap-2.ll b/llvm/test/Transforms/SampleProfile/remap-2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/remap-2.ll @@ -0,0 +1,74 @@ +; RUN: opt %s -passes=sample-profile -sample-profile-file=%S/Inputs/remap-2.prof -sample-profile-remapping-file=%S/Inputs/remap.map -S | FileCheck %s +; Check profile remapping works for searching inline instance, searching +; indirect call promotion candidate and prevent recursive inline. 
+ +@x.addr = common global i32 zeroinitializer, align 16 +@y.addr = common global i32 zeroinitializer, align 16 + +define i32 @_ZN3foo3barERKN1M1XINS_6detail3quxEEE() #0 !dbg !9 { +entry: + %t0 = load i32, i32* @x.addr, align 4 + %t1 = load i32, i32* @y.addr, align 4 + %add = add nsw i32 %t0, %t1 + ret i32 %add +} + +define i32 @_ZN1M1XE() #0 !dbg !10 { +entry: + %t0 = load i32, i32* @x.addr, align 4 + %t1 = load i32, i32* @y.addr, align 4 + %sub = sub nsw i32 %t0, %t1 + ret i32 %sub +} + +define void @test(i32 ()*) #0 !dbg !4 { + %t2 = alloca i32 ()* + store i32 ()* %0, i32 ()** %t2 + %t3 = load i32 ()*, i32 ()** %t2 +; Check call i32 %t3 has been indirect call promoted and call i32 @_ZN1M1XE +; has been inlined. +; CHECK-LABEL: @test( +; CHECK: icmp eq i32 ()* %t3, @_ZN3foo3barERKN1M1XINS_6detail3quxEEE +; CHECK-NOT: call i32 @_ZN1M1XE + %t4 = call i32 %t3(), !dbg !7 + %t5 = call i32 @_ZN1M1XE(), !dbg !8 + ret void +} + +define void @_ZN1M1X1YE(i32 ()*) #0 !dbg !11 { + %t2 = alloca i32 ()* + store i32 ()* %0, i32 ()** %t2 + %t3 = load i32 ()*, i32 ()** %t2 +; Check call i32 %t3 has got its profile but is not indirect call promoted +; because the promotion candidate is a recursive call to the current function. +; CHECK-LABEL: @_ZN1M1X1YE( +; CHECK: call i32 %t3(), {{.*}} !prof ![[PROFID:[0-9]+]] +; CHECK-NOT: icmp eq i32 ()* %t3, @_ZN1M1X1YE + %t4 = call i32 %t3(), !dbg !12 + ret void +} + +; CHECK: ![[PROFID]] = !{!"VP", i32 0, i64 3000 + +attributes #0 = { "use-sample-profile" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!13, !14} +!llvm.ident = !{!15} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "calls.cc", directory: ".") +!2 = !{} +!4 = distinct !DISubprogram(name: "test", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!5 = !DIFile(filename: "calls.cc", directory: ".") +!6 = !DISubroutineType(types: !2) +!7 = !DILocation(line: 8, scope: !4) +!8 = !DILocation(line: 9, scope: !4) +!9 = distinct !DISubprogram(name: "_ZN3foo3barERKN1M1XINS_6detail3quxEEE", line: 15, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!10 = distinct !DISubprogram(name: "_ZN1M1XE", line: 20, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!11 = distinct !DISubprogram(name: "_ZN1M1X1YE", line: 25, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!12 = !DILocation(line: 30, scope: !11) +!13 = !{i32 2, !"Dwarf Version", i32 4} +!14 = !{i32 1, !"Debug Info Version", i32 3} +!15 = !{!"clang version 3.5 "} + diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp --- a/llvm/tools/llvm-lto/llvm-lto.cpp +++ b/llvm/tools/llvm-lto/llvm-lto.cpp @@ -181,6 +181,10 @@ cl::desc("Save ThinLTO generated object files using filenames created in " "the given directory.")); +static cl::opt SaveLinkedModuleFile( + "save-linked-module", cl::init(false), + cl::desc("Write linked LTO module to file before optimize")); + 
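The CHECK lines in remap-2.ll above describe the shape indirect-call promotion gives a hot call site once the remapper has matched the profile's target name: the indirect call is guarded by a compare against the hot target and followed by a direct, inlinable call, with the original indirect call kept as the fallback; in @_ZN1M1X1YE the promotion is skipped because the only candidate is the function itself. In C-like form (hypothetical names, only to show the transformation's shape):

using FnTy = int (*)();

// After promotion of the hot target `Hot`:
//   if (Fp == &Hot)  r = Hot();   // direct call, visible to the inliner
//   else             r = Fp();    // original indirect call as fallback
int callAfterPromotion(FnTy Fp, FnTy Hot) {
  if (Fp == Hot)
    return Hot();
  return Fp();
}
// Promotion is not performed when `Hot` would be the enclosing function,
// since the inliner does not inline recursive calls.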
static cl::opt SaveModuleFile("save-merged-module", cl::init(false), cl::desc("Write merged LTO module to file before CodeGen")); @@ -1029,6 +1033,15 @@ CodeGen.setFileType(FT.getValue()); if (!OutputFilename.empty()) { + if (SaveLinkedModuleFile) { + std::string ModuleFilename = OutputFilename; + ModuleFilename += ".linked.bc"; + std::string ErrMsg; + + if (!CodeGen.writeMergedModules(ModuleFilename)) + error("writing linked module failed."); + } + if (!CodeGen.optimize(DisableVerify, DisableInline, DisableGVNLoadPRE, DisableLTOVectorization)) { // Diagnostic messages should have been printed by the handler. diff --git a/llvm/unittests/ProfileData/SampleProfTest.cpp b/llvm/unittests/ProfileData/SampleProfTest.cpp --- a/llvm/unittests/ProfileData/SampleProfTest.cpp +++ b/llvm/unittests/ProfileData/SampleProfTest.cpp @@ -89,8 +89,8 @@ auto VerifySummary = [IsPartialProfile, PartialProfileRatio]( ProfileSummary &Summary) mutable { ASSERT_EQ(ProfileSummary::PSK_Sample, Summary.getKind()); - ASSERT_EQ(137392u, Summary.getTotalCount()); - ASSERT_EQ(8u, Summary.getNumCounts()); + ASSERT_EQ(138211u, Summary.getTotalCount()); + ASSERT_EQ(10u, Summary.getNumCounts()); ASSERT_EQ(4u, Summary.getNumFunctions()); ASSERT_EQ(1437u, Summary.getMaxFunctionCount()); ASSERT_EQ(60351u, Summary.getMaxCount()); @@ -112,7 +112,7 @@ ASSERT_EQ(60000u, EightyPerc->MinCount); ASSERT_EQ(12557u, NinetyPerc->MinCount); ASSERT_EQ(12557u, NinetyFivePerc->MinCount); - ASSERT_EQ(610u, NinetyNinePerc->MinCount); + ASSERT_EQ(600u, NinetyNinePerc->MinCount); }; VerifySummary(Summary); @@ -155,6 +155,22 @@ FooSamples.addBodySamples(8, 0, 60351); FooSamples.addBodySamples(10, 0, 605); + // Add inline instance with name "_Z3gooi". + StringRef GooName("_Z3gooi"); + auto &GooSamples = + FooSamples.functionSamplesAt(LineLocation(7, 0))[GooName.str()]; + GooSamples.setName(GooName); + GooSamples.addTotalSamples(502); + GooSamples.addBodySamples(3, 0, 502); + + // Add inline instance with name "_Z3hooi". + StringRef HooName("_Z3hooi"); + auto &HooSamples = + GooSamples.functionSamplesAt(LineLocation(9, 0))[HooName.str()]; + HooSamples.setName(HooName); + HooSamples.addTotalSamples(317); + HooSamples.addBodySamples(4, 0, 317); + StringRef BarName("_Z3bari"); FunctionSamples BarSamples; BarSamples.setName(BarName); @@ -197,6 +213,8 @@ createRemapFile(RemapPath, RemapFile); FooName = "_Z4fauxi"; BarName = "_Z3barl"; + GooName = "_Z3gool"; + HooName = "_Z3hool"; } M.getOrInsertFunction(FooName, fn_type); @@ -235,6 +253,33 @@ ASSERT_EQ(7711u, ReadFooSamples->getTotalSamples()); ASSERT_EQ(610u, ReadFooSamples->getHeadSamples()); + // Try to find a FunctionSamples with GooName at given callsites containing + // inline instance for GooName. Test the correct FunctionSamples can be + // found with Remapper support. + const FunctionSamples *ReadGooSamples = + ReadFooSamples->findFunctionSamplesAt(LineLocation(7, 0), GooName, + Reader->getRemapper()); + ASSERT_TRUE(ReadGooSamples != nullptr); + ASSERT_EQ(502u, ReadGooSamples->getTotalSamples()); + + // Try to find a FunctionSamples with GooName at given callsites containing + // no inline instance for GooName. Test no FunctionSamples will be + // found with Remapper support. + const FunctionSamples *ReadGooSamplesAgain = + ReadFooSamples->findFunctionSamplesAt(LineLocation(9, 0), GooName, + Reader->getRemapper()); + ASSERT_TRUE(ReadGooSamplesAgain == nullptr); + + // The inline instance of Hoo is inside of the inline instance of Goo. 
+ // Try to find a FunctionSamples with HooName at given callsites containing + // inline instance for HooName. Test the correct FunctionSamples can be + // found with Remapper support. + const FunctionSamples *ReadHooSamples = + ReadGooSamples->findFunctionSamplesAt(LineLocation(9, 0), HooName, + Reader->getRemapper()); + ASSERT_TRUE(ReadHooSamples != nullptr); + ASSERT_EQ(317u, ReadHooSamples->getTotalSamples()); + FunctionSamples *ReadBarSamples = Reader->getSamplesFor(BarName); ASSERT_TRUE(ReadBarSamples != nullptr); if (!UseMD5) { diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -213,6 +213,7 @@ "HAVE_SIGALTSTACK=", "HAVE_STRERROR_R=", "HAVE_SYSCONF=", + "HAVE_SYSEXITS_H=", "HAVE_SYS_IOCTL_H=", "HAVE_SYS_MMAN_H=", "HAVE_SYS_PARAM_H=", @@ -249,6 +250,7 @@ "HAVE_SIGALTSTACK=1", "HAVE_STRERROR_R=1", "HAVE_SYSCONF=1", + "HAVE_SYSEXITS_H=1", "HAVE_SYS_IOCTL_H=1", "HAVE_SYS_MMAN_H=1", "HAVE_SYS_PARAM_H=1", diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -2319,7 +2319,7 @@ /// ScopBuilder::buildAccessRelations. Therefore, if this /// method is called before buildAccessRelations, false /// must be passed. - void removeStmts(std::function<bool(ScopStmt &)> ShouldDelete, + void removeStmts(function_ref<bool(ScopStmt &)> ShouldDelete, bool AfterHoisting = true); /// Get an isl string representing the context. diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp --- a/polly/lib/Analysis/ScopInfo.cpp +++ b/polly/lib/Analysis/ScopInfo.cpp @@ -1752,7 +1752,7 @@ } } -void Scop::removeStmts(std::function<bool(ScopStmt &)> ShouldDelete, +void Scop::removeStmts(function_ref<bool(ScopStmt &)> ShouldDelete, bool AfterHoisting) { for (auto StmtIt = Stmts.begin(), StmtEnd = Stmts.end(); StmtIt != StmtEnd;) { if (!ShouldDelete(*StmtIt)) { @@ -1773,40 +1773,39 @@ } void Scop::removeStmtNotInDomainMap() { - auto ShouldDelete = [this](ScopStmt &Stmt) -> bool { + removeStmts([this](ScopStmt &Stmt) -> bool { isl::set Domain = DomainMap.lookup(Stmt.getEntryBlock()); if (!Domain) return true; return Domain.is_empty(); - }; - removeStmts(ShouldDelete, false); + }); } void Scop::simplifySCoP(bool AfterHoisting) { - auto ShouldDelete = [AfterHoisting](ScopStmt &Stmt) -> bool { - // Never delete statements that contain calls to debug functions. - if (hasDebugCall(&Stmt)) - return false; - - bool RemoveStmt = Stmt.isEmpty(); - - // Remove read only statements only after invariant load hoisting.
+ if (!RemoveStmt && AfterHoisting) { + bool OnlyRead = true; + for (MemoryAccess *MA : Stmt) { + if (MA->isRead()) + continue; + + OnlyRead = false; + break; + } + + RemoveStmt = OnlyRead; + } + return RemoveStmt; + }, + AfterHoisting); } InvariantEquivClassTy *Scop::lookupInvariantEquivClass(Value *Val) { diff --git a/polly/lib/Transform/Simplify.cpp b/polly/lib/Transform/Simplify.cpp --- a/polly/lib/Transform/Simplify.cpp +++ b/polly/lib/Transform/Simplify.cpp @@ -169,12 +169,11 @@ void removeEmptyDomainStmts() { size_t NumStmtsBefore = S->getSize(); - auto ShouldDelete = [](ScopStmt &Stmt) -> bool { + S->removeStmts([](ScopStmt &Stmt) -> bool { auto EffectiveDomain = Stmt.getDomain().intersect_params(Stmt.getParent()->getContext()); return EffectiveDomain.is_empty(); - }; - S->removeStmts(ShouldDelete); + }); assert(NumStmtsBefore >= S->getSize()); EmptyDomainsRemoved = NumStmtsBefore - S->getSize();
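The Polly change swaps std::function parameters for llvm::function_ref and passes the lambdas inline at each call site. function_ref is a non-owning, non-allocating view of a callable, which fits a predicate that is only invoked for the duration of the removeStmts call and must not be stored beyond it. A minimal usage sketch under those assumptions (stand-in ScopStmt, not the Polly class):

#include "llvm/ADT/STLExtras.h" // llvm::function_ref, llvm::erase_if
#include <vector>

struct ScopStmt { bool Empty = false; }; // stand-in for polly::ScopStmt

// The callee only borrows the callable, so a temporary lambda can be passed
// directly and no std::function allocation is involved.
void removeStmts(std::vector<ScopStmt> &Stmts,
                 llvm::function_ref<bool(ScopStmt &)> ShouldDelete) {
  llvm::erase_if(Stmts, ShouldDelete);
}

void removeEmptyStmts(std::vector<ScopStmt> &Stmts) {
  removeStmts(Stmts, [](ScopStmt &S) { return S.Empty; });
}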