diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -88,6 +88,7 @@
 #include "llvm/IR/Value.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -432,8 +433,8 @@
   Value *getShadowOffset(Value *Addr, IRBuilder<> &IRB);
   Value *getShadowAddress(Value *Addr, Instruction *Pos);
-  // std::pair<Value *, Value *>
-  // getShadowOriginAddress(Value *Addr, Align InstAlignment, Instruction *Pos);
+  std::pair<Value *, Value *>
+  getShadowOriginAddress(Value *Addr, Align InstAlignment, Instruction *Pos);
   bool isInstrumented(const Function *F);
   bool isInstrumented(const GlobalAlias *GA);
   FunctionType *getArgsFunctionType(FunctionType *T);
@@ -506,6 +507,8 @@
   DenseMap<Value *, Value *> ValShadowMap;
   DenseMap<Value *, Value *> ValOriginMap;
   DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap;
+  DenseMap<AllocaInst *, AllocaInst *> AllocaOriginMap;
+
   std::vector<std::pair<PHINode *, PHINode *>> PHIFixups;
   DenseSet<Instruction *> SkipInsts;
   std::vector<Value *> NonZeroChecks;
@@ -572,8 +575,9 @@
   Value *combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
                                    Instruction *Pos);
   Value *combineOperandShadows(Instruction *Inst);
-  Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align,
-                    Instruction *Pos);
+  std::pair<Value *, Value *> loadShadowOrigin(Value *ShadowAddr, uint64_t Size,
+                                               Align InstAlignment,
+                                               Instruction *Pos);
   void storePrimitiveShadow(Value *Addr, uint64_t Size, Align Alignment,
                             Value *PrimitiveShadow, Instruction *Pos);
   /// Applies PrimitiveShadow to all primitive subtypes of T, returning
@@ -615,8 +619,20 @@
                               Align ShadowAlign, Instruction *Pos);

   /// The fast path of loading shadow in fast-16-label mode.
-  Value *loadFast16ShadowFast(Value *ShadowAddr, uint64_t Size,
-                              Align ShadowAlign, Instruction *Pos);
+  std::pair<Value *, Value *>
+  loadFast16ShadowFast(Value *ShadowAddr, Value *OriginAddr, uint64_t Size,
+                       Align ShadowAlign, Align OriginAlign, Value *FirstOrigin,
+                       Instruction *Pos);
+
+  Align getOriginAlign(Align InstAlignment);
+
+  /// Because 4 contiguous bytes share one 4-byte origin, the most accurate load
+  /// is __dfsan_load_label_and_origin. This function returns the union of all
+  /// labels and the origin of the first taint label. However, this is an
+  /// additional call with many instructions. To ensure common cases are fast,
+  /// this checks whether it is possible to load labels and origins without
+  /// using the callback function.
+  bool useCallbackLoadLabelAndOrigin(uint64_t Size, Align InstAlignment);
 };

 class DFSanVisitor : public InstVisitor<DFSanVisitor> {
@@ -1683,7 +1699,7 @@
   return IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy),
                        IRB.CreatePtrToInt(ShadowPtrMaskValue, IntptrTy));
 }
-/*
+
 std::pair<Value *, Value *>
 DataFlowSanitizer::getShadowOriginAddress(Value *Addr, Align InstAlignment,
                                           Instruction *Pos) {
@@ -1706,7 +1722,7 @@
   }
   return {ShadowPtr, OriginPtr};
 }
-*/
+
 Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {
   // Returns (Addr & shadow_mask) x 2
   IRBuilder<> IRB(Pos);
@@ -1881,17 +1897,51 @@
   return Align(Alignment.value() * DFS.ShadowWidthBytes);
 }

-Value *DFSanFunction::loadFast16ShadowFast(Value *ShadowAddr, uint64_t Size,
-                                           Align ShadowAlign,
-                                           Instruction *Pos) {
+Align DFSanFunction::getOriginAlign(Align InstAlignment) {
+  const Align Alignment = llvm::assumeAligned(InstAlignment.value());
+  return Align(std::max(kMinOriginAlignment, Alignment));
+}
+
+bool DFSanFunction::useCallbackLoadLabelAndOrigin(uint64_t Size,
+                                                  Align InstAlignment) {
+  assert(Size != 0);
+  // * if Size == 1, it is sufficient to load its origin aligned at 4.
+  // * if Size == 2, we assume that in most cases Addr % 2 == 0, so it is
+  //   sufficient to load its origin aligned at 4. If not, although origins may
+  //   be lost, it should not happen very often.
+  // * if align >= 4, Addr must be aligned to 4, otherwise it is UB. When
+  //   Size % 4 == 0, it is more efficient to load origins without callbacks.
+  // * Otherwise we use __dfsan_load_label_and_origin.
+  // This should ensure that common cases run efficiently.
+  if (Size <= 2)
+    return false;
+
+  const Align Alignment = llvm::assumeAligned(InstAlignment.value());
+  if (Alignment >= kMinOriginAlignment &&
+      Size % (64 / DFS.ShadowWidthBits) == 0)
+    return false;
+
+  return true;
+}
+
+std::pair<Value *, Value *> DFSanFunction::loadFast16ShadowFast(
+    Value *ShadowAddr, Value *OriginAddr, uint64_t Size, Align ShadowAlign,
+    Align OriginAlign, Value *FirstOrigin, Instruction *Pos) {
   // First OR all the WideShadows, then OR individual shadows within the
   // combined WideShadow. This is fewer instructions than ORing shadows
   // individually.
+  const bool ShouldTrackOrigins = DFS.shouldTrackOrigins();
+  std::vector<Value *> Shadows;
+  std::vector<Value *> Origins;
   IRBuilder<> IRB(Pos);
   Value *WideAddr =
       IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
   Value *CombinedWideShadow =
       IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
+  if (ShouldTrackOrigins) {
+    Shadows.push_back(CombinedWideShadow);
+    Origins.push_back(FirstOrigin);
+  }
   for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size;
        Ofs += 64 / DFS.ShadowWidthBits) {
     WideAddr = IRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr,
@@ -1899,12 +1949,23 @@
     Value *NextWideShadow =
         IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
     CombinedWideShadow = IRB.CreateOr(CombinedWideShadow, NextWideShadow);
+    if (ShouldTrackOrigins) {
+      Shadows.push_back(NextWideShadow);
+      OriginAddr = IRB.CreateGEP(DFS.OriginTy, OriginAddr,
+                                 ConstantInt::get(DFS.IntptrTy, 1));
+      Origins.push_back(
+          IRB.CreateAlignedLoad(DFS.OriginTy, OriginAddr, OriginAlign));
+    }
   }
   for (unsigned Width = 32; Width >= DFS.ShadowWidthBits; Width >>= 1) {
     Value *ShrShadow = IRB.CreateLShr(CombinedWideShadow, Width);
     CombinedWideShadow = IRB.CreateOr(CombinedWideShadow, ShrShadow);
   }
-  return IRB.CreateTrunc(CombinedWideShadow, DFS.PrimitiveShadowTy);
+  return {IRB.CreateTrunc(CombinedWideShadow, DFS.PrimitiveShadowTy),
+          ShouldTrackOrigins
+              ? combineOrigins(Shadows, Origins, Pos,
+                               ConstantInt::getSigned(IRB.getInt64Ty(), 0))
+              : DFS.ZeroOrigin};
 }

 Value *DFSanFunction::loadLegacyShadowFast(Value *ShadowAddr, uint64_t Size,
@@ -1977,17 +2038,27 @@
 // Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where
 // Addr has alignment Align, and take the union of each of those shadows. The
 // returned shadow always has primitive type.
-Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
-                                 Instruction *Pos) {
+std::pair<Value *, Value *> DFSanFunction::loadShadowOrigin(Value *Addr,
+                                                            uint64_t Size,
+                                                            Align InstAlignment,
+                                                            Instruction *Pos) {
+  const bool ShouldTrackOrigins = DFS.shouldTrackOrigins();
+
+  // Non-escaped loads.
   if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
-    const auto I = AllocaShadowMap.find(AI);
-    if (I != AllocaShadowMap.end()) {
+    const auto SI = AllocaShadowMap.find(AI);
+    if (SI != AllocaShadowMap.end()) {
       IRBuilder<> IRB(Pos);
-      return IRB.CreateLoad(DFS.PrimitiveShadowTy, I->second);
+      Value *ShadowLI = IRB.CreateLoad(DFS.PrimitiveShadowTy, SI->second);
+      const auto OI = AllocaOriginMap.find(AI);
+      assert(!ShouldTrackOrigins || OI != AllocaOriginMap.end());
+      return {ShadowLI, ShouldTrackOrigins
+                            ? IRB.CreateLoad(DFS.OriginTy, OI->second)
+                            : nullptr};
     }
   }

-  const llvm::Align ShadowAlign(Align * DFS.ShadowWidthBytes);
+  // Load from constant addresses.
   SmallVector<const Value *, 2> Objs;
   getUnderlyingObjects(Addr, Objs);
   bool AllConstants = true;
@@ -2001,33 +2072,65 @@
     break;
   }
   if (AllConstants)
-    return DFS.ZeroPrimitiveShadow;
+    return {DFS.ZeroPrimitiveShadow,
+            ShouldTrackOrigins ? DFS.ZeroOrigin : nullptr};
+
+  if (Size == 0)
+    return {DFS.ZeroPrimitiveShadow,
+            ShouldTrackOrigins ? DFS.ZeroOrigin : nullptr};
+
+  // Use callback to load if this is not an optimizable case for origin
+  // tracking.
+  if (ShouldTrackOrigins &&
+      useCallbackLoadLabelAndOrigin(Size, InstAlignment)) {
+    IRBuilder<> IRB(Pos);
+    CallInst *Call =
+        IRB.CreateCall(DFS.DFSanLoadLabelAndOriginFn,
+                       {IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
+                        ConstantInt::get(DFS.IntptrTy, Size)});
+    Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+    return {IRB.CreateTrunc(IRB.CreateLShr(Call, DFS.OriginWidthBits),
+                            DFS.PrimitiveShadowTy),
+            IRB.CreateTrunc(Call, DFS.OriginTy)};
+  }
+
+  // Other cases that support loading shadows or origins in a fast way.
+  Value *ShadowAddr, *OriginAddr;
+  std::tie(ShadowAddr, OriginAddr) =
+      DFS.getShadowOriginAddress(Addr, InstAlignment, Pos);
+
+  const Align ShadowAlign = getShadowAlign(InstAlignment);
+  const Align OriginAlign = getOriginAlign(InstAlignment);
+  Value *Origin = nullptr;
+  if (ShouldTrackOrigins) {
+    IRBuilder<> IRB(Pos);
+    Origin = IRB.CreateAlignedLoad(DFS.OriginTy, OriginAddr, OriginAlign);
+  }

-  Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
   switch (Size) {
-  case 0:
-    return DFS.ZeroPrimitiveShadow;
   case 1: {
     LoadInst *LI = new LoadInst(DFS.PrimitiveShadowTy, ShadowAddr, "", Pos);
     LI->setAlignment(ShadowAlign);
-    return LI;
+    return {LI, Origin};
   }
   case 2: {
     IRBuilder<> IRB(Pos);
     Value *ShadowAddr1 = IRB.CreateGEP(DFS.PrimitiveShadowTy, ShadowAddr,
                                        ConstantInt::get(DFS.IntptrTy, 1));
-    return combineShadows(
-        IRB.CreateAlignedLoad(DFS.PrimitiveShadowTy, ShadowAddr, ShadowAlign),
-        IRB.CreateAlignedLoad(DFS.PrimitiveShadowTy, ShadowAddr1, ShadowAlign),
-        Pos);
+    Value *Load =
+        IRB.CreateAlignedLoad(DFS.PrimitiveShadowTy, ShadowAddr, ShadowAlign);
+    Value *Load1 =
+        IRB.CreateAlignedLoad(DFS.PrimitiveShadowTy, ShadowAddr1, ShadowAlign);
+    return {combineShadows(Load, Load1, Pos), Origin};
   }
   }

   if (ClFast16Labels && Size % (64 / DFS.ShadowWidthBits) == 0)
-    return loadFast16ShadowFast(ShadowAddr, Size, ShadowAlign, Pos);
+    return loadFast16ShadowFast(ShadowAddr, OriginAddr, Size, ShadowAlign,
+                                OriginAlign, Origin, Pos);

   if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidthBits) == 0)
-    return loadLegacyShadowFast(ShadowAddr, Size, ShadowAlign, Pos);
+    return {loadLegacyShadowFast(ShadowAddr, Size, ShadowAlign, Pos), Origin};

   IRBuilder<> IRB(Pos);
   FunctionCallee &UnionLoadFn =
@@ -2035,7 +2138,7 @@
   CallInst *FallbackCall = IRB.CreateCall(
       UnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
   FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
-  return FallbackCall;
+  return {FallbackCall, Origin};
 }

 static AtomicOrdering addAcquireOrdering(AtomicOrdering AO) {
@@ -2060,6 +2163,7 @@
   uint64_t Size = DL.getTypeStoreSize(LI.getType());
   if (Size == 0) {
     DFSF.setShadow(&LI, DFSF.DFS.getZeroShadow(&LI));
+    DFSF.setOrigin(&LI, DFSF.DFS.ZeroOrigin);
     return;
   }

@@ -2071,13 +2175,24 @@
   if (LI.isAtomic())
     LI.setOrdering(addAcquireOrdering(LI.getOrdering()));

-  Align Alignment = ClPreserveAlignment ? LI.getAlign() : Align(1);
   Instruction *Pos = LI.isAtomic() ? LI.getNextNode() : &LI;
-  Value *PrimitiveShadow =
-      DFSF.loadShadow(LI.getPointerOperand(), Size, Alignment.value(), Pos);
+  std::vector<Value *> Shadows;
+  std::vector<Value *> Origins;
+  Value *PrimitiveShadow, *Origin;
+  std::tie(PrimitiveShadow, Origin) =
+      DFSF.loadShadowOrigin(LI.getPointerOperand(), Size, LI.getAlign(), Pos);
+  const bool ShouldTrackOrigins = DFSF.DFS.shouldTrackOrigins();
+  if (ShouldTrackOrigins) {
+    Shadows.push_back(PrimitiveShadow);
+    Origins.push_back(Origin);
+  }
   if (ClCombinePointerLabelsOnLoad) {
     Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand());
     PrimitiveShadow = DFSF.combineShadows(PrimitiveShadow, PtrShadow, Pos);
+    if (ShouldTrackOrigins) {
+      Shadows.push_back(PtrShadow);
+      Origins.push_back(DFSF.getOrigin(LI.getPointerOperand()));
+    }
   }
   if (!DFSF.DFS.isZeroShadow(PrimitiveShadow))
     DFSF.NonZeroChecks.push_back(PrimitiveShadow);
@@ -2085,6 +2200,11 @@
   Value *Shadow =
       DFSF.expandFromPrimitiveShadow(LI.getType(), PrimitiveShadow, Pos);
   DFSF.setShadow(&LI, Shadow);
+
+  if (ShouldTrackOrigins) {
+    DFSF.setOrigin(&LI, DFSF.combineOrigins(Shadows, Origins, Pos));
+  }
+
   if (ClEventCallbacks) {
     IRBuilder<> IRB(Pos);
     Value *Addr8 = IRB.CreateBitCast(LI.getPointerOperand(), DFSF.DFS.Int8Ptr);
@@ -2322,8 +2442,13 @@
   if (AllLoadsStores) {
     IRBuilder<> IRB(&I);
     DFSF.AllocaShadowMap[&I] = IRB.CreateAlloca(DFSF.DFS.PrimitiveShadowTy);
+    if (DFSF.DFS.shouldTrackOrigins()) {
+      DFSF.AllocaOriginMap[&I] =
+          IRB.CreateAlloca(DFSF.DFS.OriginTy, nullptr, "_dfsa");
+    }
   }
   DFSF.setShadow(&I, DFSF.DFS.ZeroPrimitiveShadow);
+  DFSF.setOrigin(&I, DFSF.DFS.ZeroOrigin);
 }

 void DFSanVisitor::visitSelectInst(SelectInst &I) {
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_ldst.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_ldst.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_ldst.ll
@@ -0,0 +1,251 @@
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK_META,CHECK
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK_META,NO_COMBINE_LOAD_PTR
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK_META: @__dfsan_shadow_width_bits = weak_odr constant i32 [[#SBITS:]]
+; CHECK_META: @__dfsan_shadow_width_bytes = weak_odr constant i32 [[#SBYTES:]]
+
+define {} @load0({}* %p) {
+  ; CHECK: @"dfs$load0"
+  ; CHECK-NEXT: %a = load {}, {}* %p, align 1
+  ; CHECK-NEXT: store {} zeroinitializer, {}* bitcast ([100 x i64]* @__dfsan_retval_tls to {}*), align [[#SBYTES]]
+  ; CHECK-NEXT: store i32 0, i32* @__dfsan_retval_origin_tls, align 4
+  ; CHECK-NEXT: ret {} %a
+
+  %a = load {}, {}* %p
+  ret {} %a
+}
+
+define i16 @load_non_escaped_alloca() {
+  ; CHECK: @"dfs$load_non_escaped_alloca"
+  ; CHECK: [[S_ALLOCA:%.*]] = alloca i[[#SBITS]], align [[#SBYTES]]
+  ; CHECK: [[O_ALLOCA:%.*]] = alloca i32, align 4
+  ; CHECK: [[SHADOW:%.*]] = load i[[#SBITS]], i[[#SBITS]]* [[S_ALLOCA]], align [[#SBYTES]]
+  ; CHECK: [[ORIGIN:%.*]] = load i32, i32* [[O_ALLOCA]], align 4
+  ; CHECK: %a = load i16, i16* %p, align 2
+  ; CHECK: store i[[#SBITS]] [[SHADOW]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; CHECK: store i32 [[ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %p = alloca i16
+  %a = load i16, i16* %p
+  ret i16 %a
+}
+
+define i16* @load_escaped_alloca() {
+  ; CHECK: @"dfs$load_escaped_alloca"
+  ; CHECK: [[INTP:%.*]] = ptrtoint i[[#SBITS]]* %p to i64
+  ; CHECK: [[OFFSET:%.*]] = and i64 [[INTP]], -123145302310913
+  ; CHECK: [[SHADOW_ADDR:%.*]] = mul i64 [[OFFSET]], 2
+  ; CHECK: [[SHADOW_PTR0:%.*]] = inttoptr i64 [[SHADOW_ADDR]] to i[[#SBITS]]*
+  ; CHECK: [[ORIGIN_OFFSET:%.*]] = add i64 [[OFFSET]], 35184372088832
+  ; CHECK: [[ORIGIN_ADDR:%.*]] = and i64 [[ORIGIN_OFFSET]], -4
+  ; CHECK: [[ORIGIN_PTR:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
+  ; CHECK: {{%.*}} = load i32, i32* [[ORIGIN_PTR]], align 4
+  ; CHECK: [[SHADOW_PTR1:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR0]], i64 1
+  ; CHECK: [[SHADOW0:%.*]] = load i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR0]], align [[#SBYTES]]
+  ; CHECK: [[SHADOW1:%.*]] = load i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR1]], align [[#SBYTES]]
+  ; CHECK: {{%.*}} = or i[[#SBITS]] [[SHADOW0]], [[SHADOW1]]
+  ; CHECK: %a = load i16, i16* %p, align 2
+  ; CHECK: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; CHECK: store i32 0, i32* @__dfsan_retval_origin_tls, align 4
+
+  %p = alloca i16
+  %a = load i16, i16* %p
+  ret i16* %p
+}
+
+@X = constant i1 1
+define i1 @load_global() {
+  ; CHECK: @"dfs$load_global"
+  ; CHECK: %a = load i1, i1* @X, align 1
+  ; CHECK: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; CHECK: store i32 0, i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i1, i1* @X
+  ret i1 %a
+}
+
+define i1 @load1(i1* %p) {
+  ; CHECK: @"dfs$load1"
+  ; CHECK: [[PO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; CHECK: [[PS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; CHECK: [[INTP:%.*]] = ptrtoint {{.*}} %p to i64
+  ; CHECK: [[OFFSET:%.*]] = and i64 [[INTP]], -123145302310913
+  ; CHECK: [[SHADOW_ADDR:%.*]] = mul i64 [[OFFSET]], 2
+  ; CHECK: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_ADDR]] to i[[#SBITS]]*
+  ; CHECK: [[ORIGIN_OFFSET:%.*]] = add i64 [[OFFSET]], 35184372088832
+  ; CHECK: [[ORIGIN_ADDR:%.*]] = and i64 [[ORIGIN_OFFSET]], -4
+  ; CHECK: [[ORIGIN_PTR:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
+  ; CHECK: [[AO:%.*]] = load i32, i32* [[ORIGIN_PTR]], align 4
+  ; CHECK: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR]], align [[#SBYTES]]
+  ; CHECK: [[RS:%.*]] = or i[[#SBITS]] [[AS]], [[PS]]
+  ; CHECK: [[PS_NZ:%.*]] = icmp ne i[[#SBITS]] [[PS]], 0
+  ; CHECK: [[RO:%.*]] = select i1 [[PS_NZ]], i32 [[PO]], i32 [[AO]]
+  ; CHECK: %a = load i1, i1* %p, align 1
+  ; CHECK: store i[[#SBITS]] [[RS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; CHECK: store i32 [[RO]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i1, i1* %p
+  ret i1 %a
+}
+
+define i16 @load16(i1 %i, i16* %p) {
+  ; CHECK: @"dfs$load16"
+  ; CHECK: [[PO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
+  ; CHECK: [[PS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[#SBYTES]]
+  ; CHECK: [[INTP:%.*]] = ptrtoint {{.*}} %p to i64
+  ; CHECK: [[OFFSET:%.*]] = and i64 [[INTP]], -123145302310913
+  ; CHECK: [[SHADOW_ADDR:%.*]] = mul i64 [[OFFSET]], 2
+  ; CHECK: [[SHADOW_PTR0:%.*]] = inttoptr i64 [[SHADOW_ADDR]] to i[[#SBITS]]*
+  ; CHECK: [[ORIGIN_OFFSET:%.*]] = add i64 [[OFFSET]], 35184372088832
+  ; CHECK: [[ORIGIN_ADDR:%.*]] = and i64 [[ORIGIN_OFFSET]], -4
+  ; CHECK: [[ORIGIN_PTR:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
+  ; CHECK: [[AO:%.*]] = load i32, i32* [[ORIGIN_PTR]], align 4
+  ; CHECK: [[SHADOW_PTR1:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR0]], i64 1
+  ; CHECK: [[SHADOW0:%.*]] = load i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR0]], align [[#SBYTES]]
+  ; CHECK: [[SHADOW1:%.*]] = load i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR1]], align [[#SBYTES]]
+  ; CHECK: [[AS:%.*]] = or i[[#SBITS]] [[SHADOW0]], [[SHADOW1]]
+  ; CHECK: [[RS:%.*]] = or i[[#SBITS]] [[AS]], [[PS]]
+  ; CHECK: [[PS_NZ:%.*]] = icmp ne i[[#SBITS]] [[PS]], 0
+  ; CHECK: [[RO:%.*]] = select i1 [[PS_NZ]], i32 [[PO]], i32 [[AO]]
+  ; CHECK: %a = load i16, i16* %p, align 2
+  ; CHECK: store i[[#SBITS]] [[RS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; CHECK: store i32 [[RO]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i16, i16* %p
+  ret i16 %a
+}
+
+define i32 @load32(i32* %p) {
+  ; CHECK: @"dfs$load32"
+
+  ; NO_COMBINE_LOAD_PTR: @"dfs$load32"
+  ; NO_COMBINE_LOAD_PTR: [[INTP:%.*]] = ptrtoint i32* %p to i64
+  ; NO_COMBINE_LOAD_PTR: [[OFFSET:%.*]] = and i64 [[INTP]], -123145302310913
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_ADDR:%.*]] = mul i64 [[OFFSET]], 2
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_ADDR]] to i[[#SBITS]]*
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_ADDR:%.*]] = add i64 [[OFFSET]], 35184372088832
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_PTR:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
+  ; NO_COMBINE_LOAD_PTR: [[AO:%.*]] = load i32, i32* [[ORIGIN_PTR]], align 4
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_PTR64:%.*]] = bitcast i[[#SBITS]]* [[SHADOW_PTR]] to i64*
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64:%.*]] = load i64, i64* [[SHADOW_PTR64]], align [[#SBYTES]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64_H32:%.*]] = lshr i64 [[SHADOW64]], 32
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64_HL32:%.*]] = or i64 [[SHADOW64]], [[SHADOW64_H32]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64_HL32_H16:%.*]] = lshr i64 [[SHADOW64_HL32]], 16
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64_HL32_HL16:%.*]] = or i64 [[SHADOW64_HL32]], [[SHADOW64_HL32_H16]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW:%.*]] = trunc i64 [[SHADOW64_HL32_HL16]] to i[[#SBITS]]
+  ; NO_COMBINE_LOAD_PTR: %a = load i32, i32* %p, align 4
+  ; NO_COMBINE_LOAD_PTR: store i[[#SBITS]] [[SHADOW]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; NO_COMBINE_LOAD_PTR: store i32 [[AO]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i32, i32* %p
+  ret i32 %a
+}
+
+define i64 @load64(i64* %p) {
+  ; CHECK: @"dfs$load64"
+
+  ; NO_COMBINE_LOAD_PTR: @"dfs$load64"
+  ; NO_COMBINE_LOAD_PTR: [[INTP:%.*]] = ptrtoint i64* %p to i64
+  ; NO_COMBINE_LOAD_PTR: [[OFFSET:%.*]] = and i64 [[INTP]], -123145302310913
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_ADDR:%.*]] = mul i64 [[OFFSET]], 2
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_ADDR]] to i[[#SBITS]]*
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_ADDR:%.*]] = add i64 [[OFFSET]], 35184372088832
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_PTR_0:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_0:%.*]] = load i32, i32* [[ORIGIN_PTR_0]], align 8
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_PTR_0:%.*]] = bitcast i[[#SBITS]]* [[SHADOW_PTR]] to i64*
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_0:%.*]] = load i64, i64* [[SHADOW_PTR_0]], align [[#SBYTES]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_PTR_1:%.*]] = getelementptr i64, i64* [[SHADOW_PTR_0]], i64 1
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_1:%.*]] = load i64, i64* [[SHADOW_PTR_1]], align [[#SBYTES]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64:%.*]] = or i64 [[SHADOW_0]], [[SHADOW_1]]
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_PTR_1:%.*]] = getelementptr i32, i32* [[ORIGIN_PTR_0]], i64 1
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_1:%.*]] = load i32, i32* [[ORIGIN_PTR_1]], align 8
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64_H32:%.*]] = lshr i64 [[SHADOW64]], 32
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64_HL32:%.*]] = or i64 [[SHADOW64]], [[SHADOW64_H32]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64_HL32_H16:%.*]] = lshr i64 [[SHADOW64_HL32]], 16
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64_HL32_HL16:%.*]] = or i64 [[SHADOW64_HL32]], [[SHADOW64_HL32_H16]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW:%.*]] = trunc i64 [[SHADOW64_HL32_HL16]] to i[[#SBITS]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_1_NZ:%.*]] = icmp ne i64 [[SHADOW_1]], 0
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN:%.*]] = select i1 [[SHADOW_1_NZ]], i32 [[ORIGIN_1]], i32 [[ORIGIN_0]]
+  ; NO_COMBINE_LOAD_PTR: %a = load i64, i64* %p, align 8
+  ; NO_COMBINE_LOAD_PTR: store i[[#SBITS]] [[SHADOW]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; NO_COMBINE_LOAD_PTR: store i32 [[ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i64, i64* %p
+  ret i64 %a
+}
+
+define i64 @load64_align2(i64* %p) {
+  ; CHECK: @"dfs$load64_align2"
+
+  ; NO_COMBINE_LOAD_PTR: @"dfs$load64_align2"
+  ; NO_COMBINE_LOAD_PTR-NEXT: [[INTP:%.*]] = bitcast i64* %p to i8*
+  ; NO_COMBINE_LOAD_PTR-NEXT: [[LABEL_ORIGIN:%.*]] = call zeroext i64 @__dfsan_load_label_and_origin(i8* [[INTP]], i64 8)
+  ; NO_COMBINE_LOAD_PTR-NEXT: [[LABEL_ORIGIN_H32:%.*]] = lshr i64 [[LABEL_ORIGIN]], 32
+  ; NO_COMBINE_LOAD_PTR-NEXT: [[LABEL:%.*]] = trunc i64 [[LABEL_ORIGIN_H32]] to i[[#SBITS]]
+  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN:%.*]] = trunc i64 [[LABEL_ORIGIN]] to i32
+  ; NO_COMBINE_LOAD_PTR-NEXT: %a = load i64, i64* %p, align [[#SBYTES]]
+  ; NO_COMBINE_LOAD_PTR-NEXT: store i[[#SBITS]] [[LABEL]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; NO_COMBINE_LOAD_PTR-NEXT: store i32 [[ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i64, i64* %p, align 2
+  ret i64 %a
+}
+
+define i92 @load92(i92* %p) {
+  ; CHECK: @"dfs$load92"
+
+  ; NO_COMBINE_LOAD_PTR: @"dfs$load92"
+  ; NO_COMBINE_LOAD_PTR: [[INTP:%.*]] = ptrtoint i92* %p to i64
+  ; NO_COMBINE_LOAD_PTR: [[OFFSET:%.*]] = and i64 [[INTP]], -123145302310913
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_ADDR:%.*]] = mul i64 [[OFFSET]], 2
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_ADDR]] to i[[#SBITS]]*
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_ADDR:%.*]] = add i64 [[OFFSET]], 35184372088832
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_PTR_0:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_0:%.*]] = load i32, i32* [[ORIGIN_PTR_0]], align 8
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_PTR_0:%.*]] = bitcast i[[#SBITS]]* [[SHADOW_PTR]] to i64*
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_0:%.*]] = load i64, i64* [[SHADOW_PTR_0]], align [[#SBYTES]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_PTR_1:%.*]] = getelementptr i64, i64* [[SHADOW_PTR_0]], i64 1
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_1:%.*]] = load i64, i64* [[SHADOW_PTR_1]], align [[#SBYTES]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_01:%.*]] = or i64 [[SHADOW_0]], [[SHADOW_1]]
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_PTR_1:%.*]] = getelementptr i32, i32* [[ORIGIN_PTR_0]], i64 1
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_1:%.*]] = load i32, i32* [[ORIGIN_PTR_1]], align 8
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_PTR_2:%.*]] = getelementptr i64, i64* [[SHADOW_PTR_1]], i64 1
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_2:%.*]] = load i64, i64* [[SHADOW_PTR_2]], align [[#SBYTES]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64:%.*]] = or i64 [[SHADOW_01]], [[SHADOW_2]]
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_PTR_2:%.*]] = getelementptr i32, i32* [[ORIGIN_PTR_1]], i64 1
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_2:%.*]] = load i32, i32* [[ORIGIN_PTR_2]], align 8
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64_H32:%.*]] = lshr i64 [[SHADOW64]], 32
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64_HL32:%.*]] = or i64 [[SHADOW64]], [[SHADOW64_H32]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64_HL32_H16:%.*]] = lshr i64 [[SHADOW64_HL32]], 16
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW64_HL32_HL16:%.*]] = or i64 [[SHADOW64_HL32]], [[SHADOW64_HL32_H16]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW:%.*]] = trunc i64 [[SHADOW64_HL32_HL16]] to i[[#SBITS]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_1_NZ:%.*]] = icmp ne i64 [[SHADOW_1]], 0
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN_10:%.*]] = select i1 [[SHADOW_1_NZ]], i32 [[ORIGIN_1]], i32 [[ORIGIN_0]]
+  ; NO_COMBINE_LOAD_PTR: [[SHADOW_2_NZ:%.*]] = icmp ne i64 [[SHADOW_2]], 0
+  ; NO_COMBINE_LOAD_PTR: [[ORIGIN:%.*]] = select i1 [[SHADOW_2_NZ]], i32 [[ORIGIN_2]], i32 [[ORIGIN_10]]
+  ; NO_COMBINE_LOAD_PTR: %a = load i92, i92* %p, align 8
+  ; NO_COMBINE_LOAD_PTR: store i[[#SBITS]] [[SHADOW]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; NO_COMBINE_LOAD_PTR: store i32 [[ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i92, i92* %p
+  ret i92 %a
+}
+
+define i17 @load17(i17* %p) {
+  ; CHECK: @"dfs$load17"
+
+  ; NO_COMBINE_LOAD_PTR: @"dfs$load17"
+  ; NO_COMBINE_LOAD_PTR-NEXT: [[INTP:%.*]] = bitcast i17* %p to i8*
+  ; NO_COMBINE_LOAD_PTR-NEXT: [[LABEL_ORIGIN:%.*]] = call zeroext i64 @__dfsan_load_label_and_origin(i8* [[INTP]], i64 3)
+  ; NO_COMBINE_LOAD_PTR-NEXT: [[LABEL_ORIGIN_H32:%.*]] = lshr i64 [[LABEL_ORIGIN]], 32
+  ; NO_COMBINE_LOAD_PTR-NEXT: [[LABEL:%.*]] = trunc i64 [[LABEL_ORIGIN_H32]] to i[[#SBITS]]
+  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN:%.*]] = trunc i64 [[LABEL_ORIGIN]] to i32
+  ; NO_COMBINE_LOAD_PTR-NEXT: %a = load i17, i17* %p, align 4
+  ; NO_COMBINE_LOAD_PTR-NEXT: store i[[#SBITS]] [[LABEL]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; NO_COMBINE_LOAD_PTR-NEXT: store i32 [[ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i17, i17* %p, align 4
+  ret i17 %a
+}
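
Note (not part of the patch): the CHECK lines above rely on two conventions that are easy to miss when reading the expected IR, so a minimal C++ sketch of them follows. The constants are copied from the test expectations (x86_64, 16-bit shadow labels, 4-byte origins): the mask -123145302310913 is ~0x700000000000, the origin base 35184372088832 is 0x200000000000, and the label-in-high-32 / origin-in-low-32 packing of the __dfsan_load_label_and_origin result is what the load64_align2/load17 checks decode. The helper names and namespace are illustrative only, not DFSan APIs; the authoritative logic is getShadowOriginAddress and useCallbackLoadLabelAndOrigin in the patch above.

// Reference sketch of the address arithmetic and result packing assumed by
// the origin_ldst.ll expectations. Names/namespace are hypothetical.
#include <cstdint>

namespace dfsan_sketch {

// "and -123145302310913" in the tests == Addr & ~0x700000000000.
constexpr uint64_t kShadowMask = ~0x700000000000ULL;
// "add 35184372088832" in the tests == + 0x200000000000 (origin region base).
constexpr uint64_t kOriginBase = 0x200000000000ULL;
constexpr uint64_t kShadowWidthBytes = 2;   // 16-bit labels in fast16 mode.
constexpr uint64_t kMinOriginAlignment = 4; // 4 app bytes share one origin.

// Shadow address: (Addr & ShadowMask) * ShadowWidthBytes, i.e. the
// and/mul 2/inttoptr sequence checked in load1, load16, load32, ...
inline uint64_t shadowAddr(uint64_t AppAddr) {
  return (AppAddr & kShadowMask) * kShadowWidthBytes;
}

// Origin address: ((Addr & ShadowMask) + OriginBase) & -4, i.e. the
// add/and -4/inttoptr sequence; the "and -4" disappears when the access is
// already known to be 4-byte aligned (compare load1/load16 with load32/load64).
inline uint64_t originAddr(uint64_t AppAddr) {
  return ((AppAddr & kShadowMask) + kOriginBase) & ~(kMinOriginAlignment - 1);
}

// __dfsan_load_label_and_origin packs the combined label into the upper 32
// bits and the origin into the lower 32 bits of its i64 result, which is why
// the callback-path tests lshr by 32 for the label and trunc for the origin.
inline uint16_t unpackLabel(uint64_t LabelAndOrigin) {
  return static_cast<uint16_t>(LabelAndOrigin >> 32);
}
inline uint32_t unpackOrigin(uint64_t LabelAndOrigin) {
  return static_cast<uint32_t>(LabelAndOrigin);
}

} // namespace dfsan_sketch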