diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -145,6 +145,7 @@
                                      ///< linker.
 CODEGENOPT(MergeAllConstants , 1, 1) ///< Merge identical constants.
 CODEGENOPT(MergeFunctions , 1, 0) ///< Set when -fmerge-functions is enabled.
+CODEGENOPT(HeapProf , 1, 0) ///< Set when -fmemprof is enabled.
 CODEGENOPT(MSVolatile , 1, 0) ///< Set when /volatile:ms is enabled.
 CODEGENOPT(NoCommon , 1, 0) ///< Set when -fno-common or C++ is enabled.
 CODEGENOPT(NoDwarfDirectoryAsm , 1, 0) ///< Set when -fno-dwarf-directory-asm is
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -995,6 +995,8 @@
 def fsymbol_partition_EQ : Joined<["-"], "fsymbol-partition=">, Group<f_clang_Group>,
   Flags<[CC1Option]>;
 
+defm memprof : OptInFFlag<"memprof", "Enable", "Disable", " heap memory profiling">;
+
 // Begin sanitizer flags. These should all be core options exposed in all driver
 // modes.
 let Flags = [CC1Option, CoreOption] in {
diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -55,13 +55,15 @@
   bool MinimalRuntime = false;
   // True if cross-dso CFI support is provided by the system (i.e. Android).
   bool ImplicitCfiRuntime = false;
+  bool NeedsHeapProfRt = false;
 
- public:
+public:
   /// Parses the sanitizer arguments from an argument list.
   SanitizerArgs(const ToolChain &TC, const llvm::opt::ArgList &Args);
 
   bool needsSharedRt() const { return SharedRuntime; }
 
+  bool needsHeapProfRt() const { return NeedsHeapProfRt; }
   bool needsAsanRt() const { return Sanitizers.has(SanitizerKind::Address); }
   bool needsHwasanRt() const {
     return Sanitizers.has(SanitizerKind::HWAddress);
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -67,6 +67,7 @@
 #include "llvm/Transforms/Instrumentation/BoundsChecking.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
 #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
+#include "llvm/Transforms/Instrumentation/HeapProfiler.h"
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
 #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
 #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
@@ -267,6 +268,12 @@
   return false;
 }
 
+static void addHeapProfilerPasses(const PassManagerBuilder &Builder,
+                                  legacy::PassManagerBase &PM) {
+  PM.add(createHeapProfilerFunctionPass());
+  PM.add(createModuleHeapProfilerLegacyPassPass());
+}
+
 static void addAddressSanitizerPasses(const PassManagerBuilder &Builder,
                                       legacy::PassManagerBase &PM) {
   const PassManagerBuilderWrapper &BuilderWrapper =
@@ -662,6 +669,13 @@
   if (LangOpts.Coroutines)
     addCoroutinePassesToExtensionPoints(PMBuilder);
 
+  if (CodeGenOpts.HeapProf) {
+    PMBuilder.addExtension(PassManagerBuilder::EP_OptimizerLast,
+                           addHeapProfilerPasses);
+    PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0,
+                           addHeapProfilerPasses);
+  }
+
   if (LangOpts.Sanitize.has(SanitizerKind::LocalBounds)) {
     PMBuilder.addExtension(PassManagerBuilder::EP_ScalarOptimizerLate,
                            addBoundsCheckingPass);
@@ -1367,6 +1381,11 @@
     }
   }
 
+  if (CodeGenOpts.HeapProf) {
+    MPM.addPass(createModuleToFunctionPassAdaptor(HeapProfilerPass()));
+    MPM.addPass(ModuleHeapProfilerPass());
+  }
+
   if (LangOpts.Sanitize.has(SanitizerKind::HWAddress)) {
     bool Recover = CodeGenOpts.SanitizeRecover.has(SanitizerKind::HWAddress);
     MPM.addPass(HWAddressSanitizerPass(
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -866,6 +866,9 @@
                        LinkCXXRuntimes) ||
       D.CCCIsCXX();
 
+  NeedsHeapProfRt =
+      Args.hasFlag(options::OPT_fmemprof, options::OPT_fno_memprof, false);
+
   // Finally, initialize the set of available and recoverable sanitizers.
   Sanitizers.Mask |= Kinds;
   RecoverableSanitizers.Mask |= RecoverableKinds;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4223,6 +4223,9 @@
   if (Args.getLastArg(options::OPT_save_temps_EQ))
     Args.AddLastArg(CmdArgs, options::OPT_save_temps_EQ);
 
+  if (Args.hasFlag(options::OPT_fmemprof, options::OPT_fno_memprof, false))
+    Args.AddLastArg(CmdArgs, options::OPT_fmemprof);
+
   // Embed-bitcode option.
   // Only white-listed flags below are allowed to be embedded.
   if (C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO() &&
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -686,6 +686,11 @@
     if (!Args.hasArg(options::OPT_shared) && !TC.getTriple().isAndroid())
       HelperStaticRuntimes.push_back("asan-preinit");
   }
+  if (SanArgs.needsHeapProfRt() && SanArgs.linkRuntimes()) {
+    SharedRuntimes.push_back("heapprof");
+    if (!Args.hasArg(options::OPT_shared) && !TC.getTriple().isAndroid())
+      HelperStaticRuntimes.push_back("heapprof-preinit");
+  }
   if (SanArgs.needsUbsanRt() && SanArgs.linkRuntimes()) {
     if (SanArgs.requiresMinimalRuntime())
       SharedRuntimes.push_back("ubsan_minimal");
@@ -721,6 +726,13 @@
       StaticRuntimes.push_back("asan_cxx");
   }
 
+  if (!SanArgs.needsSharedRt() && SanArgs.needsHeapProfRt() &&
+      SanArgs.linkRuntimes()) {
+    StaticRuntimes.push_back("heapprof");
+    if (SanArgs.linkCXXRuntimes())
+      StaticRuntimes.push_back("heapprof_cxx");
+  }
+
   if (!SanArgs.needsSharedRt() && SanArgs.needsHwasanRt() && SanArgs.linkRuntimes()) {
     StaticRuntimes.push_back("hwasan");
     if (SanArgs.linkCXXRuntimes())
       StaticRuntimes.push_back("hwasan_cxx");
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1033,6 +1033,8 @@
   Opts.ThinLinkBitcodeFile =
       std::string(Args.getLastArgValue(OPT_fthin_link_bitcode_EQ));
 
+  Opts.HeapProf = Args.hasArg(OPT_fmemprof);
+
   Opts.MSVolatile = Args.hasArg(OPT_fms_volatile);
 
   Opts.VectorizeLoop = Args.hasArg(OPT_vectorize_loops);
diff --git a/clang/test/Driver/fmemprof.cpp b/clang/test/Driver/fmemprof.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/Driver/fmemprof.cpp
@@ -0,0 +1,6 @@
+// RUN: %clangxx -target x86_64-linux-gnu -fmemprof %s -### 2>&1 | FileCheck %s
+// RUN: %clangxx -target x86_64-linux-gnu -fmemprof -fno-memprof %s -### 2>&1 | FileCheck %s --check-prefix=OFF
+// CHECK: "-cc1" {{.*}} "-fmemprof"
+// CHECK: ld{{.*}}libclang_rt.heapprof{{.*}}libclang_rt.heapprof_cxx
+// OFF-NOT: "-fmemprof"
+// OFF-NOT: libclang_rt.heapprof
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -176,6 +176,7 @@
 void initializeGlobalsAAWrapperPassPass(PassRegistry&);
 void initializeGuardWideningLegacyPassPass(PassRegistry&);
 void initializeHardwareLoopsPass(PassRegistry&);
+void initializeHeapProfilerLegacyPassPass(PassRegistry &);
 void initializeHotColdSplittingLegacyPassPass(PassRegistry&);
 void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &);
 void initializeIPSCCPLegacyPassPass(PassRegistry&);
@@ -303,6 +304,7 @@
 void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&);
 void initializeMetaRenamerPass(PassRegistry&);
 void initializeModuleDebugInfoPrinterPass(PassRegistry&);
+void initializeModuleHeapProfilerLegacyPassPass(PassRegistry &);
 void initializeModuleSummaryIndexWrapperPassPass(PassRegistry&);
 void initializeModuloScheduleTestPass(PassRegistry&);
 void initializeMustExecutePrinterPass(PassRegistry&);
diff --git a/llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h b/llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h
@@ -0,0 +1,51 @@
+//===--------- Definition of the HeapProfiler class ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the HeapProfiler class.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_HEAPPROFILER_H
+#define LLVM_TRANSFORMS_INSTRUMENTATION_HEAPPROFILER_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+/// Public interface to the heap profiler pass for instrumenting code to
+/// profile heap memory accesses.
+///
+/// The profiler itself is a function pass that works by inserting various
+/// calls to the HeapProfiler runtime library functions. The runtime library
+/// essentially replaces malloc() and free() with custom implementations that
+/// record data about the allocations.
+class HeapProfilerPass : public PassInfoMixin<HeapProfilerPass> {
+public:
+  explicit HeapProfilerPass();
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
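For context, the two classes in this header are the new-pass-manager entry points. Scheduling them looks like this (a minimal C++ sketch that mirrors the BackendUtil.cpp hunk earlier in this patch; the helper function name is illustrative, not part of the patch):

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Instrumentation/HeapProfiler.h"

    // Run the function-level instrumentation over every function first, then
    // the module pass that inserts the heapprof.module_ctor initializer.
    void scheduleHeapProfPasses(llvm::ModulePassManager &MPM) {
      MPM.addPass(llvm::createModuleToFunctionPassAdaptor(llvm::HeapProfilerPass()));
      MPM.addPass(llvm::ModuleHeapProfilerPass());
    }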
+
+/// Public interface to the heap profiler module pass for instrumenting code
+/// to profile heap memory allocations and accesses.
+class ModuleHeapProfilerPass : public PassInfoMixin<ModuleHeapProfilerPass> {
+public:
+  explicit ModuleHeapProfilerPass();
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+// Insert HeapProfiler instrumentation
+FunctionPass *createHeapProfilerFunctionPass();
+ModulePass *createModuleHeapProfilerLegacyPassPass();
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -110,6 +110,7 @@
 #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
 #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
+#include "llvm/Transforms/Instrumentation/HeapProfiler.h"
 #include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
 #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
@@ -258,6 +259,10 @@
     cl::Hidden,
     cl::desc("Enable inline deferral during PGO"));
 
+static cl::opt<bool> EnableHeapProfiler("enable-heap-prof", cl::init(false),
+                                        cl::Hidden, cl::ZeroOrMore,
+                                        cl::desc("Enable heap profiler"));
+
 PipelineTuningOptions::PipelineTuningOptions() {
   LoopInterleaving = true;
   LoopVectorization = true;
@@ -1034,6 +1039,12 @@
     MPM.addPass(SyntheticCountsPropagation());
 
   MPM.addPass(buildInlinerPipeline(Level, Phase, DebugLogging));
+
+  if (EnableHeapProfiler && Phase != ThinLTOPhase::PreLink) {
+    MPM.addPass(createModuleToFunctionPassAdaptor(HeapProfilerPass()));
+    MPM.addPass(ModuleHeapProfilerPass());
+  }
+
   return MPM;
 }
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -97,6 +97,7 @@
 MODULE_PASS("tsan-module", ThreadSanitizerPass())
 MODULE_PASS("kasan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/true, false, true, false))
 MODULE_PASS("sancov-module", ModuleSanitizerCoveragePass())
+MODULE_PASS("heapprof-module", ModuleHeapProfilerPass())
 MODULE_PASS("poison-checking", PoisonCheckingPass())
 #undef MODULE_PASS
 
@@ -276,6 +277,7 @@
 FUNCTION_PASS("msan", MemorySanitizerPass({}))
 FUNCTION_PASS("kmsan", MemorySanitizerPass({0, false, /*Kernel=*/true}))
 FUNCTION_PASS("tsan", ThreadSanitizerPass())
+FUNCTION_PASS("heapprof", HeapProfilerPass())
 #undef FUNCTION_PASS
 
 #ifndef FUNCTION_PASS_WITH_PARAMS
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -5,6 +5,7 @@
   ControlHeightReduction.cpp
   DataFlowSanitizer.cpp
   GCOVProfiling.cpp
+  HeapProfiler.cpp
   MemorySanitizer.cpp
   IndirectCallPromotion.cpp
   Instrumentation.cpp
diff --git a/llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp b/llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp
@@ -0,0 +1,613 @@
+//===- HeapProfiler.cpp - heap allocation and access profiler ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of HeapProfiler. Memory accesses are instrumented
+// to increment the access count held in a shadow memory location, or
+// alternatively to call into the runtime. Memory intrinsic calls (memmove,
+// memcpy, memset) are changed to call the heap profiling runtime version
+// instead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/HeapProfiler.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "heapprof"
+
+constexpr int LLVM_HEAP_PROFILER_VERSION = 1;
+
+// Size of memory mapped to a single shadow location.
+constexpr uint64_t DefaultShadowGranularity = 64;
+
+// Scale from granularity down to shadow size.
+constexpr uint64_t DefaultShadowScale = 3;
+
+constexpr char HeapProfModuleCtorName[] = "heapprof.module_ctor";
+constexpr uint64_t HeapProfCtorAndDtorPriority = 1;
+// On Emscripten, the system needs more than one priority for constructors.
+constexpr uint64_t HeapProfEmscriptenCtorAndDtorPriority = 50;
+constexpr char HeapProfInitName[] = "__heapprof_init";
+constexpr char HeapProfVersionCheckNamePrefix[] =
+    "__heapprof_version_mismatch_check_v";
+
+constexpr char HeapProfShadowMemoryDynamicAddress[] =
+    "__heapprof_shadow_memory_dynamic_address";
+
+// Command-line flags.
+
+static cl::opt<bool> ClInsertVersionCheck(
+    "heapprof-guard-against-version-mismatch",
+    cl::desc("Guard against compiler/runtime version mismatch."), cl::Hidden,
+    cl::init(true));
+
+// This flag may need to be replaced with -f[no-]memprof-reads.
+static cl::opt<bool> ClInstrumentReads("heapprof-instrument-reads",
+                                       cl::desc("instrument read instructions"),
+                                       cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+    ClInstrumentWrites("heapprof-instrument-writes",
+                       cl::desc("instrument write instructions"), cl::Hidden,
+                       cl::init(true));
+
+static cl::opt<bool> ClInstrumentAtomics(
+    "heapprof-instrument-atomics",
+    cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
+    cl::init(true));
+
+static cl::opt<bool> ClUseCalls(
+    "heapprof-use-callbacks",
+    cl::desc("Use callbacks instead of inline instrumentation sequences."),
+    cl::Hidden, cl::init(false));
+
+static cl::opt<std::string>
+    ClMemoryAccessCallbackPrefix("heapprof-memory-access-callback-prefix",
+                                 cl::desc("Prefix for memory access callbacks"),
+                                 cl::Hidden, cl::init("__heapprof_"));
+
+// These flags allow changing the shadow mapping.
+// The shadow mapping looks like
+//    Shadow = ((Mem & mask) >> scale) + offset
+
+static cl::opt<int> ClMappingScale("heapprof-mapping-scale",
+                                   cl::desc("scale of heapprof shadow mapping"),
+                                   cl::Hidden, cl::init(DefaultShadowScale));
+
+static cl::opt<int>
+    ClMappingGranularity("heapprof-mapping-granularity",
+                         cl::desc("granularity of heapprof shadow mapping"),
+                         cl::Hidden, cl::init(DefaultShadowGranularity));
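Concretely, with the defaults above (granularity 64, scale 3), every aligned 64-byte window of memory maps to one 8-byte shadow counter. A minimal C++ sketch of the address arithmetic the instrumentation emits (function and parameter names here are illustrative, not part of the patch):

    #include <cstdint>

    // 'offset' stands for the value loaded from
    // __heapprof_shadow_memory_dynamic_address at function entry.
    uint64_t shadowCounterAddress(uint64_t mem, uint64_t offset) {
      const uint64_t Granularity = 64;          // -heapprof-mapping-granularity
      const uint64_t Scale = 3;                 // -heapprof-mapping-scale
      const uint64_t Mask = ~(Granularity - 1); // clear the low 6 bits
      return ((mem & Mask) >> Scale) + offset;  // 64-byte window -> 8-byte counter
    }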
+
+// Debug flags.
+
+static cl::opt<int> ClDebug("heapprof-debug", cl::desc("debug"), cl::Hidden,
+                            cl::init(0));
+
+static cl::opt<std::string> ClDebugFunc("heapprof-debug-func", cl::Hidden,
+                                        cl::desc("Debug func"));
+
+static cl::opt<int> ClDebugMin("heapprof-debug-min", cl::desc("Debug min inst"),
+                               cl::Hidden, cl::init(-1));
+
+static cl::opt<int> ClDebugMax("heapprof-debug-max", cl::desc("Debug max inst"),
+                               cl::Hidden, cl::init(-1));
+
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+
+namespace {
+
+/// This struct defines the shadow mapping using the rule:
+///   shadow = ((mem & mask) >> Scale) + DynamicShadowOffset.
+struct ShadowMapping {
+  ShadowMapping() {
+    Scale = ClMappingScale;
+    Granularity = ClMappingGranularity;
+    Mask = ~(Granularity - 1);
+  }
+
+  int Scale;
+  int Granularity;
+  uint64_t Mask; // Computed as ~(Granularity-1)
+};
+
+static uint64_t getCtorAndDtorPriority(Triple &TargetTriple) {
+  return TargetTriple.isOSEmscripten() ? HeapProfEmscriptenCtorAndDtorPriority
+                                       : HeapProfCtorAndDtorPriority;
+}
+
+struct InterestingMemoryAccess {
+  Value *Addr = nullptr;
+  bool IsWrite;
+  unsigned Alignment;
+  uint64_t TypeSize;
+  Value *MaybeMask = nullptr;
+};
+
+/// Instrument the code in module to profile heap accesses.
+class HeapProfiler {
+public:
+  HeapProfiler(Module &M) {
+    C = &(M.getContext());
+    LongSize = M.getDataLayout().getPointerSizeInBits();
+    IntptrTy = Type::getIntNTy(*C, LongSize);
+  }
+
+  /// If it is an interesting memory access, populate information
+  /// about the access and return an InterestingMemoryAccess struct.
+  /// Otherwise return None.
+  Optional<InterestingMemoryAccess> isInterestingMemoryAccess(Instruction *I);
+
+  void instrumentMop(Instruction *I, const DataLayout &DL,
+                     InterestingMemoryAccess &Access);
+  void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
+                         Value *Addr, uint32_t TypeSize, bool IsWrite);
+  void instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask,
+                                   Instruction *I, Value *Addr,
+                                   unsigned Alignment, uint32_t TypeSize,
+                                   bool IsWrite);
+  void instrumentMemIntrinsic(MemIntrinsic *MI);
+  Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
+  bool instrumentFunction(Function &F);
+  bool maybeInsertHeapProfInitAtFunctionEntry(Function &F);
+  void insertDynamicShadowAtFunctionEntry(Function &F);
+
+private:
+  void initializeCallbacks(Module &M);
+
+  LLVMContext *C;
+  int LongSize;
+  Type *IntptrTy;
+  ShadowMapping Mapping;
+
+  // These arrays are indexed by AccessIsWrite.
+  FunctionCallee HeapProfMemoryAccessCallback[2];
+  FunctionCallee HeapProfMemoryAccessCallbackSized[2];
+
+  FunctionCallee HeapProfMemmove, HeapProfMemcpy, HeapProfMemset;
+  Value *DynamicShadowOffset = nullptr;
+};
+
+class HeapProfilerLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  explicit HeapProfilerLegacyPass() : FunctionPass(ID) {
+    initializeHeapProfilerLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "HeapProfilerFunctionPass"; }
+
+  bool runOnFunction(Function &F) override {
+    HeapProfiler Profiler(*F.getParent());
+    return Profiler.instrumentFunction(F);
+  }
+};
+
+class ModuleHeapProfiler {
+public:
+  ModuleHeapProfiler(Module &M) { TargetTriple = Triple(M.getTargetTriple()); }
+
+  bool instrumentModule(Module &);
+
+private:
+  Triple TargetTriple;
+  ShadowMapping Mapping;
+  Function *HeapProfCtorFunction = nullptr;
+};
+
+class ModuleHeapProfilerLegacyPass : public ModulePass {
+public:
+  static char ID;
+
+  explicit ModuleHeapProfilerLegacyPass() : ModulePass(ID) {
+    initializeModuleHeapProfilerLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "ModuleHeapProfiler"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {}
+
+  bool runOnModule(Module &M) override {
+    ModuleHeapProfiler HeapProfiler(M);
+    return HeapProfiler.instrumentModule(M);
+  }
+};
+
+} // end anonymous namespace
+
+HeapProfilerPass::HeapProfilerPass() {}
+
+PreservedAnalyses HeapProfilerPass::run(Function &F,
+                                        AnalysisManager<Function> &AM) {
+  Module &M = *F.getParent();
+  HeapProfiler Profiler(M);
+  if (Profiler.instrumentFunction(F))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+ModuleHeapProfilerPass::ModuleHeapProfilerPass() {}
+
+PreservedAnalyses ModuleHeapProfilerPass::run(Module &M,
+                                              AnalysisManager<Module> &AM) {
+  ModuleHeapProfiler Profiler(M);
+  if (Profiler.instrumentModule(M))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+char HeapProfilerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HeapProfilerLegacyPass, "heapprof",
+                      "HeapProfiler: profile heap allocations and accesses.",
+                      false, false)
+INITIALIZE_PASS_END(HeapProfilerLegacyPass, "heapprof",
+                    "HeapProfiler: profile heap allocations and accesses.",
+                    false, false)
+
+FunctionPass *llvm::createHeapProfilerFunctionPass() {
+  return new HeapProfilerLegacyPass();
+}
+
+char ModuleHeapProfilerLegacyPass::ID = 0;
+
+INITIALIZE_PASS(ModuleHeapProfilerLegacyPass, "heapprof-module",
+                "HeapProfiler: profile heap allocations and accesses. "
+                "ModulePass",
+                false, false)
+
+ModulePass *llvm::createModuleHeapProfilerLegacyPassPass() {
+  return new ModuleHeapProfilerLegacyPass();
+}
+
+Value *HeapProfiler::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
+  // (Shadow & mask) >> scale
+  Shadow = IRB.CreateAnd(Shadow, Mapping.Mask);
+  Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
+  // (Shadow >> scale) + offset
+  assert(DynamicShadowOffset);
+  return IRB.CreateAdd(Shadow, DynamicShadowOffset);
+}
+
+// Instrument memset/memmove/memcpy
+void HeapProfiler::instrumentMemIntrinsic(MemIntrinsic *MI) {
+  IRBuilder<> IRB(MI);
+  if (isa<MemTransferInst>(MI)) {
+    IRB.CreateCall(
+        isa<MemMoveInst>(MI) ? HeapProfMemmove : HeapProfMemcpy,
+        {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+  } else if (isa<MemSetInst>(MI)) {
+    IRB.CreateCall(
+        HeapProfMemset,
+        {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
+         IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+  }
+  MI->eraseFromParent();
+}
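At runtime, instrumentMemIntrinsic amounts to routing the intrinsic through a profiling wrapper with a libc-compatible signature. A hypothetical C++ sketch of two such wrappers (the real implementations live in the compiler-rt heapprof runtime, which is not part of this patch; RecordAccess is a made-up hook shown only as a comment):

    #include <cstddef>
    #include <cstring>

    extern "C" void *__heapprof_memcpy(void *to, const void *from, size_t size) {
      // RecordAccess(to, size);   // hypothetical: count the written range
      // RecordAccess(from, size); // hypothetical: count the read range
      return memcpy(to, from, size);
    }

    extern "C" void *__heapprof_memset(void *block, int c, size_t size) {
      // RecordAccess(block, size);
      return memset(block, c, size);
    }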
+
+Optional<InterestingMemoryAccess>
+HeapProfiler::isInterestingMemoryAccess(Instruction *I) {
+  // Do not instrument the load fetching the dynamic shadow address.
+  if (DynamicShadowOffset == I)
+    return None;
+
+  InterestingMemoryAccess Access;
+
+  const DataLayout &DL = I->getModule()->getDataLayout();
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (!ClInstrumentReads)
+      return None;
+    Access.IsWrite = false;
+    Access.TypeSize = DL.getTypeStoreSizeInBits(LI->getType());
+    Access.Alignment = LI->getAlignment();
+    Access.Addr = LI->getPointerOperand();
+  } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    if (!ClInstrumentWrites)
+      return None;
+    Access.IsWrite = true;
+    Access.TypeSize =
+        DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType());
+    Access.Alignment = SI->getAlignment();
+    Access.Addr = SI->getPointerOperand();
+  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+    if (!ClInstrumentAtomics)
+      return None;
+    Access.IsWrite = true;
+    Access.TypeSize =
+        DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType());
+    Access.Alignment = 0;
+    Access.Addr = RMW->getPointerOperand();
+  } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
+    if (!ClInstrumentAtomics)
+      return None;
+    Access.IsWrite = true;
+    Access.TypeSize =
+        DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType());
+    Access.Alignment = 0;
+    Access.Addr = XCHG->getPointerOperand();
+  } else if (auto *CI = dyn_cast<CallInst>(I)) {
+    auto *F = CI->getCalledFunction();
+    if (F && (F->getIntrinsicID() == Intrinsic::masked_load ||
+              F->getIntrinsicID() == Intrinsic::masked_store)) {
+      unsigned OpOffset = 0;
+      if (F->getIntrinsicID() == Intrinsic::masked_store) {
+        if (!ClInstrumentWrites)
+          return None;
+        // Masked store has an initial operand for the value.
+        OpOffset = 1;
+        Access.IsWrite = true;
+      } else {
+        if (!ClInstrumentReads)
+          return None;
+        Access.IsWrite = false;
+      }
+
+      auto *BasePtr = CI->getOperand(0 + OpOffset);
+      auto *Ty = cast<PointerType>(BasePtr->getType())->getElementType();
+      Access.TypeSize = DL.getTypeStoreSizeInBits(Ty);
+      if (auto *AlignmentConstant =
+              dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
+        Access.Alignment = (unsigned)AlignmentConstant->getZExtValue();
+      else
+        Access.Alignment = 1; // No alignment guarantees; we probably got Undef.
+      Access.MaybeMask = CI->getOperand(2 + OpOffset);
+      Access.Addr = BasePtr;
+    }
+  }
+
+  if (!Access.Addr)
+    return None;
+
+  // Do not instrument accesses from different address spaces; we cannot deal
+  // with them.
+  Type *PtrTy = cast<PointerType>(Access.Addr->getType()->getScalarType());
+  if (PtrTy->getPointerAddressSpace() != 0)
+    return None;
+
+  // Ignore swifterror addresses.
+  // swifterror memory addresses are mem2reg promoted by instruction
+  // selection. As such they cannot have regular uses like an instrumentation
+  // function and it makes no sense to track them as memory.
+  if (Access.Addr->isSwiftError())
+    return None;
+
+  return Access;
+}
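Masked vector accesses found above cannot be counted with a single shadow update, so instrumentMaskedLoadOrStore (next) expands them lane by lane. In C++ terms, the emitted logic is roughly the following sketch (illustrative only; with a constant mask the pass skips dead lanes outright instead of branching, and without -heapprof-use-callbacks the call is replaced by the inline shadow increment):

    #include <cstdint>

    extern "C" void __heapprof_store(uintptr_t addr); // runtime callback

    // Per-lane expansion for a masked store of N lanes; for a variable mask
    // each lane gets its own branch (the THEN/AFTER blocks in the tests below).
    template <typename T, unsigned N>
    void instrumentedMaskedStore(const bool (&mask)[N], T *p) {
      for (unsigned lane = 0; lane < N; ++lane)
        if (mask[lane])
          __heapprof_store(reinterpret_cast<uintptr_t>(&p[lane]));
    }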
+
+void HeapProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL,
+                                               Value *Mask, Instruction *I,
+                                               Value *Addr, unsigned Alignment,
+                                               uint32_t TypeSize,
+                                               bool IsWrite) {
+  auto *VTy =
+      cast<VectorType>(cast<PointerType>(Addr->getType())->getElementType());
+  uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
+  unsigned Num = VTy->getNumElements();
+  auto *Zero = ConstantInt::get(IntptrTy, 0);
+  for (unsigned Idx = 0; Idx < Num; ++Idx) {
+    Value *InstrumentedAddress = nullptr;
+    Instruction *InsertBefore = I;
+    if (auto *Vector = dyn_cast<ConstantVector>(Mask)) {
+      // dyn_cast as we might get UndefValue
+      if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) {
+        if (Masked->isZero())
+          // Mask is constant false, so no instrumentation needed.
+          continue;
+        // If we have a true or undef value, fall through to instrumentAddress
+        // with InsertBefore == I.
+      }
+    } else {
+      IRBuilder<> IRB(I);
+      Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
+      Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
+      InsertBefore = ThenTerm;
+    }
+
+    IRBuilder<> IRB(InsertBefore);
+    InstrumentedAddress =
+        IRB.CreateGEP(VTy, Addr, {Zero, ConstantInt::get(IntptrTy, Idx)});
+    instrumentAddress(I, InsertBefore, InstrumentedAddress, ElemTypeSize,
+                      IsWrite);
+  }
+}
+
+void HeapProfiler::instrumentMop(Instruction *I, const DataLayout &DL,
+                                 InterestingMemoryAccess &Access) {
+  if (Access.IsWrite)
+    NumInstrumentedWrites++;
+  else
+    NumInstrumentedReads++;
+
+  if (Access.MaybeMask) {
+    instrumentMaskedLoadOrStore(DL, Access.MaybeMask, I, Access.Addr,
+                                Access.Alignment, Access.TypeSize,
+                                Access.IsWrite);
+  } else {
+    // Since the access counts will be accumulated across the entire
+    // allocation, we only update the shadow access count for the first
+    // location and thus don't need to worry about alignment and type size.
+    instrumentAddress(I, I, Access.Addr, Access.TypeSize, Access.IsWrite);
+  }
+}
+
+void HeapProfiler::instrumentAddress(Instruction *OrigIns,
+                                     Instruction *InsertBefore, Value *Addr,
+                                     uint32_t TypeSize, bool IsWrite) {
+  IRBuilder<> IRB(InsertBefore);
+  Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+
+  if (ClUseCalls) {
+    IRB.CreateCall(HeapProfMemoryAccessCallback[IsWrite], AddrLong);
+    return;
+  }
+
+  // Create an inline sequence to compute the shadow location and increment
+  // the value by one.
+  Type *ShadowTy = Type::getInt64Ty(*C);
+  Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+  Value *ShadowPtr = memToShadow(AddrLong, IRB);
+  Value *ShadowAddr = IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy);
+  Value *ShadowValue = IRB.CreateLoad(ShadowTy, ShadowAddr);
+  Value *Inc = ConstantInt::get(Type::getInt64Ty(*C), 1);
+  ShadowValue = IRB.CreateAdd(ShadowValue, Inc);
+  IRB.CreateStore(ShadowValue, ShadowAddr);
+}
+
+bool ModuleHeapProfiler::instrumentModule(Module &M) {
+  // Create a module constructor.
+  std::string HeapProfVersion = std::to_string(LLVM_HEAP_PROFILER_VERSION);
+  std::string VersionCheckName =
+      ClInsertVersionCheck ? (HeapProfVersionCheckNamePrefix + HeapProfVersion)
+                           : "";
+  std::tie(HeapProfCtorFunction, std::ignore) =
+      createSanitizerCtorAndInitFunctions(M, HeapProfModuleCtorName,
+                                          HeapProfInitName, /*InitArgTypes=*/{},
+                                          /*InitArgs=*/{}, VersionCheckName);
+
+  const uint64_t Priority = getCtorAndDtorPriority(TargetTriple);
+  appendToGlobalCtors(M, HeapProfCtorFunction, Priority);
+
+  return true;
+}
+
+void HeapProfiler::initializeCallbacks(Module &M) {
+  IRBuilder<> IRB(*C);
+
+  for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
+    const std::string TypeStr = AccessIsWrite ? "store" : "load";
+
+    SmallVector<Type *, 2> Args2 = {IntptrTy, IntptrTy};
+    SmallVector<Type *, 1> Args1{1, IntptrTy};
+    HeapProfMemoryAccessCallbackSized[AccessIsWrite] =
+        M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + TypeStr + "N",
+                              FunctionType::get(IRB.getVoidTy(), Args2, false));
+
+    HeapProfMemoryAccessCallback[AccessIsWrite] =
+        M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + TypeStr,
+                              FunctionType::get(IRB.getVoidTy(), Args1, false));
+  }
+  HeapProfMemmove = M.getOrInsertFunction(
+      ClMemoryAccessCallbackPrefix + "memmove", IRB.getInt8PtrTy(),
+      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
+  HeapProfMemcpy = M.getOrInsertFunction(
+      ClMemoryAccessCallbackPrefix + "memcpy", IRB.getInt8PtrTy(),
+      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
+  HeapProfMemset = M.getOrInsertFunction(
+      ClMemoryAccessCallbackPrefix + "memset", IRB.getInt8PtrTy(),
+      IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy);
+}
+
+bool HeapProfiler::maybeInsertHeapProfInitAtFunctionEntry(Function &F) {
+  // For each NSObject descendant having a +load method, this method is
+  // invoked by the ObjC runtime before any of the static constructors is
+  // called. Therefore we need to instrument such methods with a call to
+  // __heapprof_init at the beginning in order to initialize our runtime
+  // before any access to the shadow memory.
+  // We cannot just ignore these methods, because they may call other
+  // instrumented functions.
+  if (F.getName().find(" load]") != std::string::npos) {
+    FunctionCallee HeapProfInitFunction =
+        declareSanitizerInitFunction(*F.getParent(), HeapProfInitName, {});
+    IRBuilder<> IRB(&F.front(), F.front().begin());
+    IRB.CreateCall(HeapProfInitFunction, {});
+    return true;
+  }
+  return false;
+}
+
+void HeapProfiler::insertDynamicShadowAtFunctionEntry(Function &F) {
+  IRBuilder<> IRB(&F.front().front());
+  Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
+      HeapProfShadowMemoryDynamicAddress, IntptrTy);
+  DynamicShadowOffset = IRB.CreateLoad(IntptrTy, GlobalDynamicAddress);
+}
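The getOrInsertFunction calls above pin down the runtime ABI: __heapprof_load/__heapprof_store take an address, the sized __heapprof_loadN/__heapprof_storeN variants take an address plus a size, and the mem* wrappers mirror libc. Purely as an illustration — the real runtime ships separately as libclang_rt.heapprof and attributes counts to allocations via the shadow — counting stubs could look like:

    #include <atomic>
    #include <cstdint>

    static std::atomic<uint64_t> NumLoads{0}, NumStores{0};

    extern "C" void __heapprof_load(uintptr_t) { ++NumLoads; }
    extern "C" void __heapprof_store(uintptr_t) { ++NumStores; }
    extern "C" void __heapprof_loadN(uintptr_t, uintptr_t) { ++NumLoads; }
    extern "C" void __heapprof_storeN(uintptr_t, uintptr_t) { ++NumStores; }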
+
+bool HeapProfiler::instrumentFunction(Function &F) {
+  if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
+    return false;
+  if (ClDebugFunc == F.getName())
+    return false;
+  if (F.getName().startswith("__heapprof_"))
+    return false;
+
+  bool FunctionModified = false;
+
+  // If needed, insert __heapprof_init.
+  // This function needs to be called even if the function body is not
+  // instrumented.
+  if (maybeInsertHeapProfInitAtFunctionEntry(F))
+    FunctionModified = true;
+
+  LLVM_DEBUG(dbgs() << "HEAPPROF instrumenting:\n" << F << "\n");
+
+  initializeCallbacks(*F.getParent());
+
+  insertDynamicShadowAtFunctionEntry(F);
+
+  SmallVector<Instruction *, 16> ToInstrument;
+
+  // Fill the set of memory operations to instrument.
+  for (auto &BB : F) {
+    for (auto &Inst : BB) {
+      if (isInterestingMemoryAccess(&Inst) || isa<MemIntrinsic>(Inst))
+        ToInstrument.push_back(&Inst);
+    }
+  }
+
+  int NumInstrumented = 0;
+  for (auto *Inst : ToInstrument) {
+    if (ClDebugMin < 0 || ClDebugMax < 0 ||
+        (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
+      Optional<InterestingMemoryAccess> Access =
+          isInterestingMemoryAccess(Inst);
+      if (Access)
+        instrumentMop(Inst, F.getParent()->getDataLayout(), *Access);
+      else
+        instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
+    }
+    NumInstrumented++;
+  }
+
+  if (NumInstrumented > 0)
+    FunctionModified = true;
+
+  LLVM_DEBUG(dbgs() << "HEAPPROF done instrumenting: " << FunctionModified
+                    << " " << F << "\n");
+
+  return FunctionModified;
+}
diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
--- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -105,6 +105,8 @@
 void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializeAddressSanitizerLegacyPassPass(Registry);
   initializeModuleAddressSanitizerLegacyPassPass(Registry);
+  initializeHeapProfilerLegacyPassPass(Registry);
+  initializeModuleHeapProfilerLegacyPassPass(Registry);
   initializeBoundsCheckingLegacyPassPass(Registry);
   initializeControlHeightReductionLegacyPassPass(Registry);
   initializeGCOVProfilerLegacyPassPass(Registry);
diff --git a/llvm/test/Instrumentation/HeapProfiler/basic.ll b/llvm/test/Instrumentation/HeapProfiler/basic.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Instrumentation/HeapProfiler/basic.ll
@@ -0,0 +1,179 @@
+; Test basic heap profiler instrumentation.
+;
+; RUN: opt < %s -heapprof -heapprof-module -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
+
+; We need the requires since both heapprof and heapprof-module require reading
+; module level metadata which is done once by the heapprof-globals-md analysis.
+; RUN: opt < %s -passes='function(heapprof),module(heapprof-module)' -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s
+; RUN: opt < %s -passes='function(heapprof),module(heapprof-module)' -heapprof-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+; CHECK: @llvm.global_ctors = {{.*}}@heapprof.module_ctor
+
+define i32 @test_load(i32* %a) {
+entry:
+  %tmp1 = load i32, i32* %a, align 4
+  ret i32 %tmp1
+}
+; CHECK-LABEL: @test_load
+; CHECK: %[[SHADOW_OFFSET:[^ ]*]] = load i64, i64* @__heapprof_shadow_memory_dynamic_address
+; CHECK-NEXT: %[[LOAD_ADDR:[^ ]*]] = ptrtoint i32* %a to i64
+; CHECK-NEXT: %[[MASKED_ADDR:[^ ]*]] = and i64 %[[LOAD_ADDR]], -64
+; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3
+; CHECK-S5-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 5
+; CHECK-NEXT: add i64 %[[SHIFTED_ADDR]], %[[SHADOW_OFFSET]]
+; CHECK-NEXT: %[[LOAD_SHADOW_PTR:[^ ]*]] = inttoptr
+; CHECK-NEXT: %[[LOAD_SHADOW:[^ ]*]] = load i64, i64* %[[LOAD_SHADOW_PTR]]
+; CHECK-NEXT: %[[NEW_SHADOW:[^ ]*]] = add i64 %[[LOAD_SHADOW]], 1
+; CHECK-NEXT: store i64 %[[NEW_SHADOW]], i64* %[[LOAD_SHADOW_PTR]]
+; The actual load.
+; CHECK-NEXT: %tmp1 = load i32, i32* %a
+; CHECK-NEXT: ret i32 %tmp1
+
+define void @test_store(i32* %a) {
+entry:
+  store i32 42, i32* %a, align 4
+  ret void
+}
+; CHECK-LABEL: @test_store
+; CHECK: %[[SHADOW_OFFSET:[^ ]*]] = load i64, i64* @__heapprof_shadow_memory_dynamic_address
+; CHECK-NEXT: %[[STORE_ADDR:[^ ]*]] = ptrtoint i32* %a to i64
+; CHECK-NEXT: %[[MASKED_ADDR:[^ ]*]] = and i64 %[[STORE_ADDR]], -64
+; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3
+; CHECK-S5-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 5
+; CHECK-NEXT: add i64 %[[SHIFTED_ADDR]], %[[SHADOW_OFFSET]]
+; CHECK-NEXT: %[[STORE_SHADOW_PTR:[^ ]*]] = inttoptr
+; CHECK-NEXT: %[[STORE_SHADOW:[^ ]*]] = load i64, i64* %[[STORE_SHADOW_PTR]]
+; CHECK-NEXT: %[[NEW_SHADOW:[^ ]*]] = add i64 %[[STORE_SHADOW]], 1
+; CHECK-NEXT: store i64 %[[NEW_SHADOW]], i64* %[[STORE_SHADOW_PTR]]
+; The actual store.
+; CHECK-NEXT: store i32 42, i32* %a
+; CHECK-NEXT: ret void
+
+define void @FP80Test(x86_fp80* nocapture %a) nounwind uwtable {
+entry:
+  store x86_fp80 0xK3FFF8000000000000000, x86_fp80* %a, align 16
+  ret void
+}
+; CHECK-LABEL: @FP80Test
+; Exactly one shadow update for the store access.
+; CHECK-NOT: store i64
+; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
+; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]]
+; CHECK-NOT: store i64
+; The actual store.
+; CHECK: store x86_fp80 0xK3FFF8000000000000000, x86_fp80* %a
+; CHECK: ret void
+
+define void @i40test(i40* %a, i40* %b) nounwind uwtable {
+entry:
+  %t = load i40, i40* %a
+  store i40 %t, i40* %b, align 8
+  ret void
+}
+; CHECK-LABEL: @i40test
+; Exactly one shadow update for the load access.
+; CHECK-NOT: store i64
+; CHECK: %[[NEW_LD_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
+; CHECK-NEXT: store i64 %[[NEW_LD_SHADOW]]
+; CHECK-NOT: store i64
+; The actual load.
+; CHECK: %t = load i40, i40* %a
+; Exactly one shadow update for the store access.
+; CHECK-NOT: store i64
+; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
+; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]]
+; CHECK-NOT: store i64
+; The actual store.
+; CHECK: store i40 %t, i40* %b
+; CHECK: ret void
+
+define void @i64test_align1(i64* %b) nounwind uwtable {
+entry:
+  store i64 0, i64* %b, align 1
+  ret void
+}
+; CHECK-LABEL: @i64test_align1
+; Exactly one shadow update for the store access.
+; CHECK-NOT: store i64
+; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
+; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]]
+; CHECK-NOT: store i64
+; The actual store.
+; CHECK: store i64 0, i64* %b
+; CHECK: ret void
+
+define void @i80test(i80* %a, i80* %b) nounwind uwtable {
+entry:
+  %t = load i80, i80* %a
+  store i80 %t, i80* %b, align 8
+  ret void
+}
+; CHECK-LABEL: @i80test
+; Exactly one shadow update for the load access.
+; CHECK-NOT: store i64
+; CHECK: %[[NEW_LD_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
+; CHECK-NEXT: store i64 %[[NEW_LD_SHADOW]]
+; CHECK-NOT: store i64
+; The actual load.
+; CHECK: %t = load i80, i80* %a
+; Exactly one shadow update for the store access.
+; CHECK-NOT: store i64
+; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
+; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]]
+; CHECK-NOT: store i64
+; The actual store.
+; CHECK: store i80 %t, i80* %b
+; CHECK: ret void
+
+; heapprof should not instrument functions with available_externally linkage.
+define available_externally i32 @f_available_externally(i32* %a) {
+entry:
+  %tmp1 = load i32, i32* %a
+  ret i32 %tmp1
+}
+; CHECK-LABEL: @f_available_externally
+; CHECK-NOT: __heapprof_shadow_memory_dynamic_address
+; CHECK: ret i32
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) nounwind
+
+define void @memintr_test(i8* %a, i8* %b) nounwind uwtable {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 100, i1 false)
+  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %a, i8* %b, i64 100, i1 false)
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 100, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: memintr_test
+; CHECK: __heapprof_memset
+; CHECK: __heapprof_memmove
+; CHECK: __heapprof_memcpy
+; CHECK: ret void
+
+declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind
+declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind
+
+define void @memintr_element_atomic_test(i8* %a, i8* %b) nounwind uwtable {
+  ; This is a canary test to make sure that these don't get lowered into calls that don't
+  ; have the element-atomic property. Eventually, heapprof will have to be enhanced to lower
+  ; these properly.
+  ; CHECK-LABEL: memintr_element_atomic_test
+  ; CHECK: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1)
+  ; CHECK: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1)
+  ; CHECK: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1)
+  ; CHECK: ret void
+  tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1)
+  tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1)
+  tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1)
+  ret void
+}
+
+; CHECK: define internal void @heapprof.module_ctor()
+; CHECK: call void @__heapprof_init()
diff --git a/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll b/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll
@@ -0,0 +1,36 @@
+; Test heapprof internal compiler flags:
+;   -heapprof-use-callbacks
+;   -heapprof-memory-access-callback-prefix
+
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks -S | FileCheck %s --check-prefix=CHECK-CALL --check-prefix=CHECK-CALL-DEFAULT
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks -heapprof-memory-access-callback-prefix=__foo_ -S | FileCheck %s --check-prefix=CHECK-CALL --check-prefix=CHECK-CALL-CUSTOM
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks=false -S | FileCheck %s --check-prefix=CHECK-INLINE
+; RUN: opt < %s -heapprof -heapprof-module -S | FileCheck %s --check-prefix=CHECK-INLINE
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test_load(i32* %a, i64* %b, i512* %c, i80* %d) {
+entry:
+; CHECK-CALL: %[[LOAD_ADDR1:[^ ]*]] = ptrtoint i32* %a to i64
+; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR1]])
+; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR1]])
+; CHECK-CALL: %[[LOAD_ADDR2:[^ ]*]] = ptrtoint i64* %b to i64
+; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR2]])
+; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR2]])
+; CHECK-CALL: %[[LOAD_ADDR3:[^ ]*]] = ptrtoint i512* %c to i64
+; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR3]])
+; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR3]])
+; CHECK-CALL: %[[LOAD_ADDR4:[^ ]*]] = ptrtoint i80* %d to i64
+; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR4]])
+; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR4]])
+; CHECK-CALL-DEFAULT-NOT: call void @__heapprof_load
+; CHECK-CALL-CUSTOM-NOT: call void @__foo_load
+; CHECK-INLINE-NOT: call void @__heapprof_load
+  %tmp1 = load i32, i32* %a, align 4
+  %tmp2 = load i64, i64* %b, align 8
+  %tmp3 = load i512, i512* %c, align 32
+  %tmp4 = load i80, i80* %d, align 8
+  ret void
+}
diff --git a/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll
@@ -0,0 +1,246 @@
+; RUN: opt < %s -heapprof -heapprof-use-callbacks -S \
+; RUN:     | FileCheck %s -check-prefix=LOAD -check-prefix=STORE -check-prefix=ALL
+; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-reads=0 -S \
+; RUN:     | FileCheck %s -check-prefix=NOLOAD -check-prefix=STORE -check-prefix=ALL
+; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-writes=0 -S \
+; RUN:     | FileCheck %s -check-prefix=LOAD -check-prefix=NOSTORE -check-prefix=ALL
+; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-reads=0 -heapprof-instrument-writes=0 -S \
+; RUN:     | FileCheck %s -check-prefix=NOLOAD -check-prefix=NOSTORE -check-prefix=ALL
+; Support heap profiling instrumentation for constant-mask llvm.masked.{load,store}
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@v4f32 = global <4 x float>* zeroinitializer, align 8
+@v8i32 = global <8 x i32>* zeroinitializer, align 8
+@v4i64 = global <4 x i32*>* zeroinitializer, align 8
+
+;;;;;;;;;;;;;;;; STORE
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) argmemonly nounwind
+declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) argmemonly nounwind
+declare void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*>, <4 x i32*>*, i32, <4 x i1>) argmemonly nounwind
+
+define void @store.v4f32.1110(<4 x float> %arg) {
+; ALL-LABEL: @store.v4f32.1110
+  %p = load <4 x float>*, <4 x float>** @v4f32, align 8
+; NOSTORE-NOT: call void @__heapprof_store
+; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP0]])
+; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1
+; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP1]])
+; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
+; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP2]])
+; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>)
+  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>)
+  ret void
+}
+
+define void @store.v8i32.10010110(<8 x i32> %arg) {
+; ALL-LABEL: @store.v8i32.10010110
+  %p = load <8 x i32>*, <8 x i32>** @v8i32, align 8
+; NOSTORE-NOT: call void @__heapprof_store
+; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0
+; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP0]])
+; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 3
+; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP3]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP3]])
+; STORE: [[GEP5:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 5
+; STORE: [[PGEP5:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP5]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP5]])
+; STORE: [[GEP6:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 6
+; STORE: [[PGEP6:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP6]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP6]])
+; STORE: tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>)
+  tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>)
+  ret void
+}
+
+define void @store.v4i64.0001(<4 x i32*> %arg) {
+; ALL-LABEL: @store.v4i64.0001
+  %p = load <4 x i32*>*, <4 x i32*>** @v4i64, align 8
+; NOSTORE-NOT: call void @__heapprof_store
+; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3
+; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP3]])
+; STORE: tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
+  tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
+  ret void
+}
+
+define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) {
+; ALL-LABEL: @store.v4f32.variable
+  %p = load <4 x float>*, <4 x float>** @v4f32, align 8
+; STORE: [[MASK0:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 0
+; STORE: br i1 [[MASK0]], label %[[THEN0:[0-9A-Za-z]+]], label %[[AFTER0:[0-9A-Za-z]+]]
+; STORE: [[THEN0]]:
+; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP0]])
+; STORE: br label %[[AFTER0]]
+; STORE: [[AFTER0]]:
+
+; STORE: [[MASK1:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 1
+; STORE: br i1 [[MASK1]], label %[[THEN1:[0-9A-Za-z]+]], label %[[AFTER1:[0-9A-Za-z]+]]
+; STORE: [[THEN1]]:
+; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1
+; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP1]])
+; STORE: br label %[[AFTER1]]
+; STORE: [[AFTER1]]:
+
+; STORE: [[MASK2:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 2
+; STORE: br i1 [[MASK2]], label %[[THEN2:[0-9A-Za-z]+]], label %[[AFTER2:[0-9A-Za-z]+]]
+; STORE: [[THEN2]]:
+; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
+; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP2]])
+; STORE: br label %[[AFTER2]]
+; STORE: [[AFTER2]]:
+
+; STORE: [[MASK3:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 3
+; STORE: br i1 [[MASK3]], label %[[THEN3:[0-9A-Za-z]+]], label %[[AFTER3:[0-9A-Za-z]+]]
+; STORE: [[THEN3]]:
+; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
+; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP3]])
+; STORE: br label %[[AFTER3]]
+; STORE: [[AFTER3]]:
+
+; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> %mask)
+  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> %mask)
+  ret void
+}
+
+;; Store using two masked.stores, which should instrument them both.
+define void @store.v4f32.1010.split(<4 x float> %arg) {
+; BOTH-LABEL: @store.v4f32.1010.split
+  %p = load <4 x float>*, <4 x float>** @v4f32, align 8
+; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP0]])
+; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
+  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
+; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
+; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP1]])
+; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>)
+  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>)
+  ret void
+}
+
+;;;;;;;;;;;;;;;; LOAD
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) argmemonly nounwind
+declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) argmemonly nounwind
+declare <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>*, i32, <4 x i1>, <4 x i32*>) argmemonly nounwind
+
+define <8 x i32> @load.v8i32.11100001(<8 x i32> %arg) {
+; ALL-LABEL: @load.v8i32.11100001
+  %p = load <8 x i32>*, <8 x i32>** @v8i32, align 8
+; NOLOAD-NOT: call void @__heapprof_load
+; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0
+; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP0]])
+; LOAD: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 1
+; LOAD: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP1]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP1]])
+; LOAD: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 2
+; LOAD: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP2]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP2]])
+; LOAD: [[GEP7:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 7
+; LOAD: [[PGEP7:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP7]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP7]])
+; LOAD: tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg)
+  %res = tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg)
+  ret <8 x i32> %res
+}
+
+define <4 x float> @load.v4f32.1001(<4 x float> %arg) {
+; ALL-LABEL: @load.v4f32.1001
+  %p = load <4 x float>*, <4 x float>** @v4f32, align 8
+; NOLOAD-NOT: call void @__heapprof_load
+; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP0]])
+; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
+; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP3]])
+; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg)
+  %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg)
+  ret <4 x float> %res
+}
+
+define <4 x i32*> @load.v4i64.0001(<4 x i32*> %arg) {
+; ALL-LABEL: @load.v4i64.0001
+  %p = load <4 x i32*>*, <4 x i32*>** @v4i64, align 8
+; NOLOAD-NOT: call void @__heapprof_load
+; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3
+; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP3]])
+; LOAD: tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32*> %arg)
+  %res = tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32*> %arg)
+  ret <4 x i32*> %res
+}
+
+define <4 x float> @load.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) {
+; ALL-LABEL: @load.v4f32.variable
+  %p = load <4 x float>*, <4 x float>** @v4f32, align 8
+; LOAD: [[MASK0:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 0
+; LOAD: br i1 [[MASK0]], label %[[THEN0:[0-9A-Za-z]+]], label %[[AFTER0:[0-9A-Za-z]+]]
+; LOAD: [[THEN0]]:
+; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP0]])
+; LOAD: br label %[[AFTER0]]
+; LOAD: [[AFTER0]]:
+
+; LOAD: [[MASK1:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 1
+; LOAD: br i1 [[MASK1]], label %[[THEN1:[0-9A-Za-z]+]], label %[[AFTER1:[0-9A-Za-z]+]]
+; LOAD: [[THEN1]]:
+; LOAD: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1
+; LOAD: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP1]])
+; LOAD: br label %[[AFTER1]]
+; LOAD: [[AFTER1]]:
+
+; LOAD: [[MASK2:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 2
+; LOAD: br i1 [[MASK2]], label %[[THEN2:[0-9A-Za-z]+]], label %[[AFTER2:[0-9A-Za-z]+]]
+; LOAD: [[THEN2]]:
+; LOAD: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
+; LOAD: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP2]])
+; LOAD: br label %[[AFTER2]]
+; LOAD: [[AFTER2]]:
+
+; LOAD: [[MASK3:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 3
+; LOAD: br i1 [[MASK3]], label %[[THEN3:[0-9A-Za-z]+]], label %[[AFTER3:[0-9A-Za-z]+]]
+; LOAD: [[THEN3]]:
+; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
+; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP3]])
+; LOAD: br label %[[AFTER3]]
+; LOAD: [[AFTER3]]:
+
+; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> %mask, <4 x float> %arg)
+  %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> %mask, <4 x float> %arg)
+  ret <4 x float> %res
+}
+
+;; Load using two masked.loads, which should instrument them both.
+define <4 x float> @load.v4f32.1001.split(<4 x float> %arg) {
+; BOTH-LABEL: @load.v4f32.1001.split
+  %p = load <4 x float>*, <4 x float>** @v4f32, align 8
+; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP0]])
+; LOAD: %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %arg)
+  %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %arg)
+; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
+; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP3]])
+; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %res)
+  %res2 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %res)
+  ret <4 x float> %res2
+}
diff --git a/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll b/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll
@@ -0,0 +1,29 @@
+; Test that the scale (-heapprof-mapping-scale) and granularity
+; (-heapprof-mapping-granularity) command-line options work as expected.
+;
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-granularity 32 -S | FileCheck --check-prefix=CHECK-GRAN %s
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-scale 1 -S | FileCheck --check-prefix=CHECK-SCALE %s
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-granularity 16 -heapprof-mapping-scale 0 -S | FileCheck --check-prefix=CHECK-BOTH %s
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @read(i32* %a) {
+entry:
+  %tmp1 = load i32, i32* %a, align 4
+  ret i32 %tmp1
+}
+; CHECK-GRAN-LABEL: @read
+; CHECK-GRAN-NOT: ret
+; CHECK-GRAN: and {{.*}} -32
+; CHECK-GRAN-NEXT: lshr {{.*}} 3
+; CHECK-GRAN: ret
+
+; CHECK-SCALE-LABEL: @read
+; CHECK-SCALE-NOT: ret
+; CHECK-SCALE: and {{.*}} -64
+; CHECK-SCALE-NEXT: lshr {{.*}} 1
+; CHECK-SCALE: ret
+
+; CHECK-BOTH-LABEL: @read
+; CHECK-BOTH-NOT: ret
+; CHECK-BOTH: and {{.*}} -16
+; CHECK-BOTH-NEXT: lshr {{.*}} 0
+; CHECK-BOTH: ret
diff --git a/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll b/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll
@@ -0,0 +1,12 @@
+; Check that the HeapProf module constructor guards against compiler/runtime
+; version mismatch.
+
+; RUN: opt < %s -heapprof-module -S | FileCheck %s
+; RUN: opt < %s -heapprof-module -heapprof-guard-against-version-mismatch=0 -S | FileCheck %s --check-prefix=NOGUARD
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: define internal void @heapprof.module_ctor()
+; CHECK: call void @__heapprof_version_mismatch_check_v1
+; NOGUARD-NOT: call void @__heapprof_version_mismatch_check_
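End to end, the intended workflow is to compile with clang++ -fmemprof: the driver forwards -fmemprof to cc1 (setting CodeGenOpts.HeapProf), the backend schedules the two instrumentation passes, and the link pulls in libclang_rt.heapprof. A toy program whose heap traffic the instrumentation would count (illustrative only; the report format is up to the runtime, which this patch does not include — the module constructor sketched by the test above runs __heapprof_version_mismatch_check_v1 and __heapprof_init before main):

    // Build with a patched compiler: clang++ -fmemprof demo.cpp -o demo
    #include <memory>
    #include <numeric>

    int main() {
      // One heap allocation; the 1024 stores from iota and 1024 loads from
      // accumulate are counted in shadow memory and attributed to it.
      auto Data = std::make_unique<int[]>(1024);
      std::iota(Data.get(), Data.get() + 1024, 0);
      return std::accumulate(Data.get(), Data.get() + 1024, 0) & 0xff;
    }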