diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -16,9 +16,38 @@
 /// issues within their own code.
 ///
 /// The analysis is based on automatic propagation of data flow labels (also
-/// known as taint labels) through a program as it performs computation.  Each
-/// byte of application memory is backed by two bytes of shadow memory which
-/// hold the label.  On Linux/x86_64, memory is laid out as follows:
+/// known as taint labels) through a program as it performs computation.
+///
+/// There are two possible memory layouts. In the first one, each byte of
+/// application memory is backed by a shadow memory byte. The shadow byte can
+/// represent up to 8 labels. To enable this you must specify the
+/// -dfsan-fast-8-labels flag. On Linux/x86_64, memory is then laid out as
+/// follows:
+///
+/// +--------------------+ 0x800000000000 (top of memory)
+/// | application memory |
+/// +--------------------+ 0x700000008000 (kAppAddr)
+/// |                    |
+/// |       unused       |
+/// |                    |
+/// +--------------------+ 0x300200000000 (kUnusedAddr)
+/// |    union table     |
+/// +--------------------+ 0x300000000000 (kUnionTableAddr)
+/// |       origin       |
+/// +--------------------+ 0x200000008000 (kOriginAddr)
+/// |   shadow memory    |
+/// +--------------------+ 0x100000008000 (kShadowAddr)
+/// |       unused       |
+/// +--------------------+ 0x000000010000
+/// | reserved by kernel |
+/// +--------------------+ 0x000000000000
+///
+///
+/// In the second memory layout, each byte of application memory is backed by
+/// two bytes of shadow memory which hold the label. That means we can represent
+/// either 16 labels (with -dfsan-fast-16-labels flag) or 2^16 labels (on the
+/// default legacy mode) per byte. On Linux/x86_64, memory is then laid out as
+/// follows:
 ///
 /// +--------------------+ 0x800000000000 (top of memory)
 /// | application memory |
@@ -36,6 +65,7 @@
 /// | reserved by kernel |
 /// +--------------------+ 0x000000000000
 ///
+///
 /// To derive a shadow memory address from an application memory address,
 /// bits 44-46 are cleared to bring the address into the range
 /// [0x000000008000,0x100000000000).  Then the address is shifted left by 1 to
@@ -200,6 +230,14 @@
              "labels to 16."),
     cl::Hidden, cl::init(false));
 
+// Use a distinct bit for each base label, enabling faster unions with less
+// instrumentation.  Limits the max number of base labels to 8.
+static cl::opt<bool> ClFast8Labels(
+    "dfsan-fast-8-labels",
+    cl::desc("Use more efficient instrumentation, limiting the number of "
+             "labels to 8."),
+    cl::Hidden, cl::init(false));
+
 // Controls whether the pass tracks the control flow of select instructions.
 static cl::opt<bool> ClTrackSelectControlFlow(
     "dfsan-track-select-control-flow",
@@ -341,8 +379,6 @@
   friend class DFSanVisitor;
 
   enum {
-    ShadowWidthBits = 16,
-    ShadowWidthBytes = ShadowWidthBits / 8,
     OriginWidthBits = 32,
     OriginWidthBytes = OriginWidthBits / 8
   };
@@ -383,6 +419,9 @@
     WK_Custom
   };
 
+  unsigned ShadowWidthBits;
+  unsigned ShadowWidthBytes;
+
   Module *Mod;
   LLVMContext *Ctx;
   Type *Int8Ptr;
@@ -419,7 +458,7 @@
   FunctionCallee DFSanUnionFn;
   FunctionCallee DFSanCheckedUnionFn;
   FunctionCallee DFSanUnionLoadFn;
-  FunctionCallee DFSanUnionLoadFast16LabelsFn;
+  FunctionCallee DFSanUnionLoadFastLabelsFn;
   FunctionCallee DFSanLoadLabelAndOriginFn;
   FunctionCallee DFSanUnimplementedFn;
   FunctionCallee DFSanSetLabelFn;
@@ -442,6 +481,7 @@
 
   Value *getShadowOffset(Value *Addr, IRBuilder<> &IRB);
   Value *getShadowAddress(Value *Addr, Instruction *Pos);
+  Value *getShadowAddress(Value *Addr, Instruction *Pos, Value *ShadowOffset);
   std::pair<Value *, Value *>
   getShadowOriginAddress(Value *Addr, Align InstAlignment, Instruction *Pos);
   bool isInstrumented(const Function *F);
@@ -462,6 +502,9 @@
 
   bool init(Module &M);
 
+  /// Returns whether fast8 or fast16 mode has been specified.
+  bool hasFastLabelsEnabled();
+
   /// Returns whether the pass tracks origins. Support only fast16 mode in TLS
   /// ABI mode.
   bool shouldTrackOrigins();
@@ -733,6 +776,14 @@
 
 DataFlowSanitizer::DataFlowSanitizer(
     const std::vector<std::string> &ABIListFiles) {
+  if (ClFast8Labels && ClFast16Labels) {
+    report_fatal_error(
+        "cannot set both -dfsan-fast-8-labels and -dfsan-fast-16-labels");
+  }
+
+  ShadowWidthBits = ClFast8Labels ? 8 : 16;
+  ShadowWidthBytes = ShadowWidthBits / 8;
+
   std::vector<std::string> AllABIListFiles(std::move(ABIListFiles));
   llvm::append_range(AllABIListFiles, ClABIListFiles);
   // FIXME: should we propagate vfs::FileSystem to this constructor?
@@ -827,6 +878,11 @@
   return isa<ConstantAggregateZero>(V);
 }
 
+bool DataFlowSanitizer::hasFastLabelsEnabled() {
+  static const bool HasFastLabelsEnabled = ClFast8Labels || ClFast16Labels;
+  return HasFastLabelsEnabled;
+}
+
 bool DataFlowSanitizer::shouldTrackOrigins() {
   static const bool ShouldTrackOrigins =
       ClTrackOrigins && getInstrumentedABI() == DataFlowSanitizer::IA_TLS &&
@@ -835,7 +891,8 @@
 }
 
 bool DataFlowSanitizer::shouldTrackFieldsAndIndices() {
-  return getInstrumentedABI() == DataFlowSanitizer::IA_TLS && ClFast16Labels;
+  return getInstrumentedABI() == DataFlowSanitizer::IA_TLS &&
+         hasFastLabelsEnabled();
 }
 
 Constant *DataFlowSanitizer::getZeroShadow(Type *OrigTy) {
@@ -1000,11 +1057,15 @@
 
   switch (TargetTriple.getArch()) {
   case Triple::x86_64:
-    ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
+    ShadowPtrMask = ClFast8Labels
+                        ? ConstantInt::getSigned(IntptrTy, ~0x600000000000LL)
+                        : ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
     break;
   case Triple::mips64:
   case Triple::mips64el:
-    ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL);
+    ShadowPtrMask = ClFast8Labels
+                        ? ConstantInt::getSigned(IntptrTy, ~0xE000000000LL)
+                        : ConstantInt::getSigned(IntptrTy, ~0xF000000000LL);
     break;
   case Triple::aarch64:
   case Triple::aarch64_be:
@@ -1238,7 +1299,7 @@
                          Attribute::ReadOnly);
     AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
                          Attribute::ZExt);
-    DFSanUnionLoadFast16LabelsFn = Mod->getOrInsertFunction(
+    DFSanUnionLoadFastLabelsFn = Mod->getOrInsertFunction(
         "__dfsan_union_load_fast16labels", DFSanUnionLoadFnTy, AL);
   }
   {
@@ -1290,7 +1351,7 @@
   DFSanRuntimeFunctions.insert(
       DFSanUnionLoadFn.getCallee()->stripPointerCasts());
   DFSanRuntimeFunctions.insert(
-      DFSanUnionLoadFast16LabelsFn.getCallee()->stripPointerCasts());
+      DFSanUnionLoadFastLabelsFn.getCallee()->stripPointerCasts());
   DFSanRuntimeFunctions.insert(
       DFSanLoadLabelAndOriginFn.getCallee()->stripPointerCasts());
   DFSanRuntimeFunctions.insert(
@@ -1757,8 +1818,7 @@
   // Returns ((Addr & shadow_mask) + origin_base) & ~4UL
   IRBuilder<> IRB(Pos);
   Value *ShadowOffset = getShadowOffset(Addr, IRB);
-  Value *ShadowPtr = IRB.CreateIntToPtr(
-      IRB.CreateMul(ShadowOffset, ShadowPtrMul), PrimitiveShadowPtrTy);
+  Value *ShadowPtr = getShadowAddress(Addr, Pos, ShadowOffset);
   Value *OriginPtr = nullptr;
   if (shouldTrackOrigins()) {
     Value *OriginLong = IRB.CreateAdd(ShadowOffset, OriginBase);
@@ -1774,12 +1834,21 @@
   return {ShadowPtr, OriginPtr};
 }
 
+Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos,
+                                           Value *ShadowOffset) {
+  IRBuilder<> IRB(Pos);
+
+  if (!ShadowPtrMul->isOne())
+    ShadowOffset = IRB.CreateMul(ShadowOffset, ShadowPtrMul);
+
+  return IRB.CreateIntToPtr(ShadowOffset, PrimitiveShadowPtrTy);
+}
+
 Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {
   // Returns (Addr & shadow_mask) x 2
   IRBuilder<> IRB(Pos);
   Value *ShadowOffset = getShadowOffset(Addr, IRB);
-  return IRB.CreateIntToPtr(IRB.CreateMul(ShadowOffset, ShadowPtrMul),
-                            PrimitiveShadowPtrTy);
+  return getShadowAddress(Addr, Pos, ShadowOffset);
 }
 
 Value *DFSanFunction::combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
@@ -1829,7 +1898,7 @@
   Value *PV2 = collapseToPrimitiveShadow(V2, Pos);
 
   IRBuilder<> IRB(Pos);
-  if (ClFast16Labels) {
+  if (DFS.hasFastLabelsEnabled()) {
     CCS.Block = Pos->getParent();
     CCS.Shadow = IRB.CreateOr(PV1, PV2);
   } else if (AvoidNewBlocks) {
@@ -1978,27 +2047,53 @@
 std::pair<Value *, Value *> DFSanFunction::loadFast16ShadowFast(
     Value *ShadowAddr, Value *OriginAddr, uint64_t Size, Align ShadowAlign,
     Align OriginAlign, Value *FirstOrigin, Instruction *Pos) {
-  // First OR all the WideShadows, then OR individual shadows within the
-  // combined WideShadow. This is fewer instructions than ORing shadows
-  // individually.
   const bool ShouldTrackOrigins = DFS.shouldTrackOrigins();
+  const uint64_t ShadowSize = Size * DFS.ShadowWidthBytes;
+
+  assert(Size >= 4 && "Not large enough load size for fast path!");
+
+  // Used for origin tracking.
   std::vector<Value *> Shadows;
   std::vector<Value *> Origins;
+
+  // Load instructions in LLVM can have arbitrary byte sizes (e.g., 3, 12, 20)
+  // but this function is only used in a subset of cases that make it possible
+  // to optimize the instrumentation.
+  //
+  // Specifically, when the shadow size in bytes (i.e., loaded bytes x shadow
+  // per byte) is either:
+  // - a multiple of 8  (common)
+  // - equal to 4       (only for load32 in fast-8 mode)
+  //
+  // For the second case, we can fit the wide shadow in a 32-bit integer. In all
+  // other cases, we use a 64-bit integer to hold the wide shadow.
+  Type *WideShadowTy =
+      ShadowSize == 4 ? Type::getInt32Ty(*DFS.Ctx) : Type::getInt64Ty(*DFS.Ctx);
+
   IRBuilder<> IRB(Pos);
-  Value *WideAddr =
-      IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
+  Value *WideAddr = IRB.CreateBitCast(ShadowAddr, WideShadowTy->getPointerTo());
   Value *CombinedWideShadow =
-      IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
+      IRB.CreateAlignedLoad(WideShadowTy, WideAddr, ShadowAlign);
+
   if (ShouldTrackOrigins) {
     Shadows.push_back(CombinedWideShadow);
     Origins.push_back(FirstOrigin);
   }
-  for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size;
-       Ofs += 64 / DFS.ShadowWidthBits) {
-    WideAddr = IRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr,
+
+  // First OR all the WideShadows (i.e., 64bit or 32bit shadow chunks) linearly;
+  // then OR individual shadows within the combined WideShadow by binary ORing.
+  // This is fewer instructions than ORing shadows individually, since it
+  // needs logN shift/or instructions (N being the bytes of the combined wide
+  // shadow).
+  unsigned WideShadowBitWidth = WideShadowTy->getIntegerBitWidth();
+  const uint64_t BytesPerWideShadow = WideShadowBitWidth / DFS.ShadowWidthBits;
+
+  for (uint64_t ByteOfs = BytesPerWideShadow; ByteOfs < Size;
+       ByteOfs += BytesPerWideShadow) {
+    WideAddr = IRB.CreateGEP(WideShadowTy, WideAddr,
                              ConstantInt::get(DFS.IntptrTy, 1));
     Value *NextWideShadow =
-        IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
+        IRB.CreateAlignedLoad(WideShadowTy, WideAddr, ShadowAlign);
     CombinedWideShadow = IRB.CreateOr(CombinedWideShadow, NextWideShadow);
     if (ShouldTrackOrigins) {
       Shadows.push_back(NextWideShadow);
@@ -2008,7 +2103,8 @@
           IRB.CreateAlignedLoad(DFS.OriginTy, OriginAddr, OriginAlign));
     }
   }
-  for (unsigned Width = 32; Width >= DFS.ShadowWidthBits; Width >>= 1) {
+  for (unsigned Width = WideShadowBitWidth / 2; Width >= DFS.ShadowWidthBits;
+       Width >>= 1) {
     Value *ShrShadow = IRB.CreateLShr(CombinedWideShadow, Width);
     CombinedWideShadow = IRB.CreateOr(CombinedWideShadow, ShrShadow);
   }
@@ -2023,24 +2119,33 @@
                                            Align ShadowAlign,
                                            Instruction *Pos) {
   // Fast path for the common case where each byte has identical shadow: load
-  // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any
-  // shadow is non-equal.
+  // shadow 64 (or 32) bits at a time, fall out to a __dfsan_union_load call if
+  // any shadow is non-equal.
   BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F);
   IRBuilder<> FallbackIRB(FallbackBB);
   CallInst *FallbackCall = FallbackIRB.CreateCall(
       DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
   FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
 
+  const uint64_t ShadowSize = Size * DFS.ShadowWidthBytes;
+  assert(Size >= 4 && "Not large enough load size for fast path!");
+
+  // Same as in loadFast16AShadowsFast. In the case of load32, we can fit the
+  // wide shadow in a 32-bit integer instead.
+  Type *WideShadowTy =
+      ShadowSize == 4 ? Type::getInt32Ty(*DFS.Ctx) : Type::getInt64Ty(*DFS.Ctx);
+
   // Compare each of the shadows stored in the loaded 64 bits to each other,
   // by computing (WideShadow rotl ShadowWidthBits) == WideShadow.
   IRBuilder<> IRB(Pos);
-  Value *WideAddr =
-      IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
+  unsigned WideShadowBitWidth = WideShadowTy->getIntegerBitWidth();
+  Value *WideAddr = IRB.CreateBitCast(ShadowAddr, WideShadowTy->getPointerTo());
   Value *WideShadow =
-      IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
+      IRB.CreateAlignedLoad(WideShadowTy, WideAddr, ShadowAlign);
   Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.PrimitiveShadowTy);
   Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidthBits);
-  Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidthBits);
+  Value *ShrShadow =
+      IRB.CreateLShr(WideShadow, WideShadowBitWidth - DFS.ShadowWidthBits);
   Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
   Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
 
@@ -2063,15 +2168,17 @@
   ReplaceInstWithInst(Head->getTerminator(), LastBr);
   DT.addNewBlock(FallbackBB, Head);
 
-  for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size;
-       Ofs += 64 / DFS.ShadowWidthBits) {
+  const uint64_t BytesPerWideShadow = WideShadowBitWidth / DFS.ShadowWidthBits;
+
+  for (uint64_t ByteOfs = BytesPerWideShadow; ByteOfs < Size;
+       ByteOfs += BytesPerWideShadow) {
     BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
     DT.addNewBlock(NextBB, LastBr->getParent());
     IRBuilder<> NextIRB(NextBB);
-    WideAddr = NextIRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr,
+    WideAddr = NextIRB.CreateGEP(WideShadowTy, WideAddr,
                                  ConstantInt::get(DFS.IntptrTy, 1));
     Value *NextWideShadow =
-        NextIRB.CreateAlignedLoad(NextIRB.getInt64Ty(), WideAddr, ShadowAlign);
+        NextIRB.CreateAlignedLoad(WideShadowTy, WideAddr, ShadowAlign);
     ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow);
     LastBr->setSuccessor(0, NextBB);
     LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB);
@@ -2158,6 +2265,8 @@
     Origin = IRB.CreateAlignedLoad(DFS.OriginTy, OriginAddr, OriginAlign);
   }
 
+  // When the byte size is small enough, we can load the shadow directly with
+  // just a few instructions.
   switch (Size) {
   case 1: {
     LoadInst *LI = new LoadInst(DFS.PrimitiveShadowTy, ShadowAddr, "", Pos);
@@ -2175,17 +2284,21 @@
     return {combineShadows(Load, Load1, Pos), Origin};
   }
   }
+  uint64_t ShadowSize = Size * DFS.ShadowWidthBytes;
+  bool HasSizeForFastPath = ShadowSize % 8 == 0 || ShadowSize == 4;
+  bool HasFastLabelsEnabled = DFS.hasFastLabelsEnabled();
 
-  if (ClFast16Labels && Size % (64 / DFS.ShadowWidthBits) == 0)
+  if (HasFastLabelsEnabled && HasSizeForFastPath)
     return loadFast16ShadowFast(ShadowAddr, OriginAddr, Size, ShadowAlign,
                                 OriginAlign, Origin, Pos);
 
-  if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidthBits) == 0)
+  if (!AvoidNewBlocks && HasSizeForFastPath)
     return {loadLegacyShadowFast(ShadowAddr, Size, ShadowAlign, Pos), Origin};
 
   IRBuilder<> IRB(Pos);
-  FunctionCallee &UnionLoadFn =
-      ClFast16Labels ? DFS.DFSanUnionLoadFast16LabelsFn : DFS.DFSanUnionLoadFn;
+  FunctionCallee &UnionLoadFn = HasFastLabelsEnabled
+                                    ? DFS.DFSanUnionLoadFastLabelsFn
+                                    : DFS.DFSanUnionLoadFn;
   CallInst *FallbackCall = IRB.CreateCall(
       UnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
   FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
@@ -2406,7 +2519,10 @@
   std::tie(ShadowAddr, OriginAddr) =
       DFS.getShadowOriginAddress(Addr, InstAlignment, Pos);
 
-  const unsigned ShadowVecSize = 128 / DFS.ShadowWidthBits;
+  const unsigned ShadowVecSize = 8;
+  assert(ShadowVecSize * DFS.ShadowWidthBits <= 128 &&
+         "Shadow vector is too large!");
+
   uint64_t Offset = 0;
   uint64_t LeftSize = Size;
   if (LeftSize >= ShadowVecSize) {
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/abilist.ll b/llvm/test/Instrumentation/DataFlowSanitizer/abilist.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/abilist.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/abilist.ll
@@ -1,4 +1,6 @@
 ; RUN: opt < %s -dfsan -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/abilist_aggregate.ll b/llvm/test/Instrumentation/DataFlowSanitizer/abilist_aggregate.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/abilist_aggregate.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/abilist_aggregate.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI
 ; RUN: opt < %s -dfsan -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s --check-prefixes=CHECK,LEGACY
 ; RUN: opt < %s -dfsan -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s --check-prefixes=CHECK,ARGS_ABI
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/array.ll b/llvm/test/Instrumentation/DataFlowSanitizer/array.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/array.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/array.ll
@@ -1,10 +1,15 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,LEGACY
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-event-callbacks=true -S | FileCheck %s --check-prefixes=CHECK,EVENT_CALLBACKS
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-event-callbacks=true -S | FileCheck %s --check-prefixes=CHECK,EVENT_CALLBACKS
 ; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s --check-prefixes=CHECK,ARGS_ABI
-; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST16
+; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_LOAD_PTR
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-debug-nonzero-labels -S | FileCheck %s --check-prefixes=CHECK,DEBUG_NONZERO_LABELS
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_LOAD_PTR
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-debug-nonzero-labels -S | FileCheck %s --check-prefixes=CHECK,DEBUG_NONZERO_LABELS
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -90,12 +95,12 @@
   ; EVENT_CALLBACKS: [[L:%.*]] = or i[[#SBITS]]
   ; EVENT_CALLBACKS: call void @__dfsan_load_callback(i[[#SBITS]] [[L]], i8* {{.*}})
 
-  ; FAST16: @"dfs$load_array1"
-  ; FAST16: [[P:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
-  ; FAST16: [[L:%.*]] = load i[[#SBITS]], i[[#SBITS]]* {{.*}}, align [[#SBYTES]]
-  ; FAST16: [[U:%.*]] = or i[[#SBITS]] [[L]], [[P]]
-  ; FAST16: [[S1:%.*]] = insertvalue [1 x i[[#SBITS]]] undef, i[[#SBITS]] [[U]], 0
-  ; FAST16: store [1 x i[[#SBITS]]] [[S1]], [1 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [1 x i[[#SBITS]]]*), align [[ALIGN]]
+  ; FAST: @"dfs$load_array1"
+  ; FAST: [[P:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
+  ; FAST: [[L:%.*]] = load i[[#SBITS]], i[[#SBITS]]* {{.*}}, align [[#SBYTES]]
+  ; FAST: [[U:%.*]] = or i[[#SBITS]] [[L]], [[P]]
+  ; FAST: [[S1:%.*]] = insertvalue [1 x i[[#SBITS]]] undef, i[[#SBITS]] [[U]], 0
+  ; FAST: store [1 x i[[#SBITS]]] [[S1]], [1 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [1 x i[[#SBITS]]]*), align [[ALIGN]]
 
   ; LEGACY: @"dfs$load_array1"
   ; LEGACY: [[P:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
@@ -123,13 +128,13 @@
   ; EVENT_CALLBACKS: [[O2:%.*]] = or i[[#SBITS]] [[O1]]
   ; EVENT_CALLBACKS: call void @__dfsan_load_callback(i[[#SBITS]] [[O2]], i8* {{.*}})
 
-  ; FAST16: @"dfs$load_array2"
-  ; FAST16: [[P:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
-  ; FAST16: [[O:%.*]] = or i[[#SBITS]]
-  ; FAST16: [[U:%.*]] = or i[[#SBITS]] [[O]], [[P]]
-  ; FAST16: [[S:%.*]] = insertvalue [2 x i[[#SBITS]]] undef, i[[#SBITS]] [[U]], 0
-  ; FAST16: [[S1:%.*]] = insertvalue [2 x i[[#SBITS]]] [[S]], i[[#SBITS]] [[U]], 1
-  ; FAST16: store [2 x i[[#SBITS]]] [[S1]], [2 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [2 x i[[#SBITS]]]*), align [[ALIGN]]
+  ; FAST: @"dfs$load_array2"
+  ; FAST: [[P:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
+  ; FAST: [[O:%.*]] = or i[[#SBITS]]
+  ; FAST: [[U:%.*]] = or i[[#SBITS]] [[O]], [[P]]
+  ; FAST: [[S:%.*]] = insertvalue [2 x i[[#SBITS]]] undef, i[[#SBITS]] [[U]], 0
+  ; FAST: [[S1:%.*]] = insertvalue [2 x i[[#SBITS]]] [[S]], i[[#SBITS]] [[U]], 1
+  ; FAST: store [2 x i[[#SBITS]]] [[S1]], [2 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [2 x i[[#SBITS]]]*), align [[ALIGN]]
   %a = load [2 x i1], [2 x i1]* %p
   ret [2 x i1] %a
 }
@@ -150,14 +155,14 @@
   ; EVENT_CALLBACKS: [[O3:%.*]] = or i[[#SBITS]] [[O2]]
   ; EVENT_CALLBACKS: call void @__dfsan_load_callback(i[[#SBITS]] [[O3]], i8* {{.*}})
 
-  ; FAST16: @"dfs$load_array4"
-  ; FAST16: [[T:%.*]] = trunc i[[#mul(4, SBITS)]] {{.*}} to i[[#SBITS]]
-  ; FAST16: [[O:%.*]] = or i[[#SBITS]] [[T]]
-  ; FAST16: [[S1:%.*]] = insertvalue [4 x i[[#SBITS]]] undef, i[[#SBITS]] [[O]], 0
-  ; FAST16: [[S2:%.*]] = insertvalue [4 x i[[#SBITS]]] [[S1]], i[[#SBITS]] [[O]], 1
-  ; FAST16: [[S3:%.*]] = insertvalue [4 x i[[#SBITS]]] [[S2]], i[[#SBITS]] [[O]], 2
-  ; FAST16: [[S4:%.*]] = insertvalue [4 x i[[#SBITS]]] [[S3]], i[[#SBITS]] [[O]], 3
-  ; FAST16: store [4 x i[[#SBITS]]] [[S4]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align 2
+  ; FAST: @"dfs$load_array4"
+  ; FAST: [[T:%.*]] = trunc i[[#mul(4, SBITS)]] {{.*}} to i[[#SBITS]]
+  ; FAST: [[O:%.*]] = or i[[#SBITS]] [[T]]
+  ; FAST: [[S1:%.*]] = insertvalue [4 x i[[#SBITS]]] undef, i[[#SBITS]] [[O]], 0
+  ; FAST: [[S2:%.*]] = insertvalue [4 x i[[#SBITS]]] [[S1]], i[[#SBITS]] [[O]], 1
+  ; FAST: [[S3:%.*]] = insertvalue [4 x i[[#SBITS]]] [[S2]], i[[#SBITS]] [[O]], 2
+  ; FAST: [[S4:%.*]] = insertvalue [4 x i[[#SBITS]]] [[S3]], i[[#SBITS]] [[O]], 3
+  ; FAST: store [4 x i[[#SBITS]]] [[S4]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align 2
 
   ; LEGACY: @"dfs$load_array4"
   ; LEGACY: [[P:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
@@ -191,25 +196,25 @@
 }
 
 define void @store_alloca_array([4 x i1] %a) {
-  ; FAST16: @"dfs$store_alloca_array"
-  ; FAST16: [[S:%.*]] = load [4 x i[[#SBITS]]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [4 x i[[#SBITS]]]*), align [[ALIGN:2]]
-  ; FAST16: [[SP:%.*]] = alloca i[[#SBITS]], align [[#SBYTES]]
-  ; FAST16: [[E0:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 0
-  ; FAST16: [[E1:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 1
-  ; FAST16: [[E01:%.*]] = or i[[#SBITS]] [[E0]], [[E1]]
-  ; FAST16: [[E2:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 2
-  ; FAST16: [[E012:%.*]] = or i[[#SBITS]] [[E01]], [[E2]]
-  ; FAST16: [[E3:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 3
-  ; FAST16: [[E0123:%.*]] = or i[[#SBITS]] [[E012]], [[E3]]
-  ; FAST16: store i[[#SBITS]] [[E0123]], i[[#SBITS]]* [[SP]], align [[#SBYTES]]
+  ; FAST: @"dfs$store_alloca_array"
+  ; FAST: [[S:%.*]] = load [4 x i[[#SBITS]]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [4 x i[[#SBITS]]]*), align [[ALIGN:2]]
+  ; FAST: [[SP:%.*]] = alloca i[[#SBITS]], align [[#SBYTES]]
+  ; FAST: [[E0:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 0
+  ; FAST: [[E1:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 1
+  ; FAST: [[E01:%.*]] = or i[[#SBITS]] [[E0]], [[E1]]
+  ; FAST: [[E2:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 2
+  ; FAST: [[E012:%.*]] = or i[[#SBITS]] [[E01]], [[E2]]
+  ; FAST: [[E3:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 3
+  ; FAST: [[E0123:%.*]] = or i[[#SBITS]] [[E012]], [[E3]]
+  ; FAST: store i[[#SBITS]] [[E0123]], i[[#SBITS]]* [[SP]], align [[#SBYTES]]
   %p = alloca [4 x i1]
   store [4 x i1] %a, [4 x i1]* %p
   ret void
 }
 
 define void @store_zero_array([4 x i1]* %p) {
-  ; FAST16: @"dfs$store_zero_array"
-  ; FAST16: store i[[#mul(4, SBITS)]] 0, i[[#mul(4, SBITS)]]* {{.*}}
+  ; FAST: @"dfs$store_zero_array"
+  ; FAST: store i[[#mul(4, SBITS)]] 0, i[[#mul(4, SBITS)]]* {{.*}}
   store [4 x i1] zeroinitializer, [4 x i1]* %p
   ret void
 }
@@ -227,15 +232,15 @@
   ; EVENT_CALLBACKS: [[P:%.*]] = bitcast [2 x i1]* %p to i8*
   ; EVENT_CALLBACKS: call void @__dfsan_store_callback(i[[#SBITS]] [[E12]], i8* [[P]])
 
-  ; FAST16: @"dfs$store_array2"
-  ; FAST16: [[S:%.*]] = load [2 x i[[#SBITS]]], [2 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [2 x i[[#SBITS]]]*), align [[ALIGN:2]]
-  ; FAST16: [[E1:%.*]] = extractvalue [2 x i[[#SBITS]]] [[S]], 0
-  ; FAST16: [[E2:%.*]] = extractvalue [2 x i[[#SBITS]]] [[S]], 1
-  ; FAST16: [[E12:%.*]] = or i[[#SBITS]] [[E1]], [[E2]]
-  ; FAST16: [[SP0:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[SP:%.*]], i32 0
-  ; FAST16: store i[[#SBITS]] [[E12]], i[[#SBITS]]* [[SP0]], align [[#SBYTES]]
-  ; FAST16: [[SP1:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[SP]], i32 1
-  ; FAST16: store i[[#SBITS]] [[E12]], i[[#SBITS]]* [[SP1]], align [[#SBYTES]]
+  ; FAST: @"dfs$store_array2"
+  ; FAST: [[S:%.*]] = load [2 x i[[#SBITS]]], [2 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [2 x i[[#SBITS]]]*), align [[ALIGN:2]]
+  ; FAST: [[E1:%.*]] = extractvalue [2 x i[[#SBITS]]] [[S]], 0
+  ; FAST: [[E2:%.*]] = extractvalue [2 x i[[#SBITS]]] [[S]], 1
+  ; FAST: [[E12:%.*]] = or i[[#SBITS]] [[E1]], [[E2]]
+  ; FAST: [[SP0:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[SP:%.*]], i32 0
+  ; FAST: store i[[#SBITS]] [[E12]], i[[#SBITS]]* [[SP0]], align [[#SBYTES]]
+  ; FAST: [[SP1:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[SP]], i32 1
+  ; FAST: store i[[#SBITS]] [[E12]], i[[#SBITS]]* [[SP1]], align [[#SBYTES]]
 
   ; COMBINE_STORE_PTR: @"dfs$store_array2"
   ; COMBINE_STORE_PTR: [[O:%.*]] = or i[[#SBITS]]
@@ -250,72 +255,72 @@
 }
 
 define void @store_array17([17 x i1] %a, [17 x i1]* %p) {
-  ; FAST16: @"dfs$store_array17"
-  ; FAST16: %[[#R:]]   = load [17 x i[[#SBITS]]], [17 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [17 x i[[#SBITS]]]*), align 2
-  ; FAST16: %[[#R+1]]  = extractvalue [17 x i[[#SBITS]]] %[[#R]], 0
-  ; FAST16: %[[#R+2]]  = extractvalue [17 x i[[#SBITS]]] %[[#R]], 1
-  ; FAST16: %[[#R+3]]  = or i[[#SBITS]] %[[#R+1]], %[[#R+2]]
-  ; FAST16: %[[#R+4]]  = extractvalue [17 x i[[#SBITS]]] %[[#R]], 2
-  ; FAST16: %[[#R+5]]  = or i[[#SBITS]] %[[#R+3]], %[[#R+4]]
-  ; FAST16: %[[#R+6]]  = extractvalue [17 x i[[#SBITS]]] %[[#R]], 3
-  ; FAST16: %[[#R+7]]  = or i[[#SBITS]] %[[#R+5]], %[[#R+6]]
-  ; FAST16: %[[#R+8]]  = extractvalue [17 x i[[#SBITS]]] %[[#R]], 4
-  ; FAST16: %[[#R+9]]  = or i[[#SBITS]] %[[#R+7]], %[[#R+8]]
-  ; FAST16: %[[#R+10]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 5
-  ; FAST16: %[[#R+11]] = or i[[#SBITS]] %[[#R+9]], %[[#R+10]]
-  ; FAST16: %[[#R+12]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 6
-  ; FAST16: %[[#R+13]] = or i[[#SBITS]] %[[#R+11]], %[[#R+12]]
-  ; FAST16: %[[#R+14]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 7
-  ; FAST16: %[[#R+15]] = or i[[#SBITS]] %[[#R+13]], %[[#R+14]]
-  ; FAST16: %[[#R+16]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 8
-  ; FAST16: %[[#R+17]] = or i[[#SBITS]] %[[#R+15]], %[[#R+16]]
-  ; FAST16: %[[#R+18]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 9
-  ; FAST16: %[[#R+19]] = or i[[#SBITS]] %[[#R+17]], %[[#R+18]]
-  ; FAST16: %[[#R+20]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 10
-  ; FAST16: %[[#R+21]] = or i[[#SBITS]] %[[#R+19]], %[[#R+20]]
-  ; FAST16: %[[#R+22]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 11
-  ; FAST16: %[[#R+23]] = or i[[#SBITS]] %[[#R+21]], %[[#R+22]]
-  ; FAST16: %[[#R+24]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 12
-  ; FAST16: %[[#R+25]] = or i[[#SBITS]] %[[#R+23]], %[[#R+24]]
-  ; FAST16: %[[#R+26]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 13
-  ; FAST16: %[[#R+27]] = or i[[#SBITS]] %[[#R+25]], %[[#R+26]]
-  ; FAST16: %[[#R+28]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 14
-  ; FAST16: %[[#R+29]] = or i[[#SBITS]] %[[#R+27]], %[[#R+28]]
-  ; FAST16: %[[#R+30]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 15
-  ; FAST16: %[[#R+31]] = or i[[#SBITS]] %[[#R+29]], %[[#R+30]]
-  ; FAST16: %[[#R+32]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 16
-  ; FAST16: %[[#R+33]] = or i[[#SBITS]] %[[#R+31]], %[[#R+32]]
-  ; FAST16: %[[#VREG:]]  = insertelement <8 x i[[#SBITS]]> undef, i[[#SBITS]] %[[#R+33]], i32 0
-  ; FAST16: %[[#VREG+1]] = insertelement <8 x i[[#SBITS]]> %[[#VREG]], i[[#SBITS]] %[[#R+33]], i32 1
-  ; FAST16: %[[#VREG+2]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+1]], i[[#SBITS]] %[[#R+33]], i32 2
-  ; FAST16: %[[#VREG+3]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+2]], i[[#SBITS]] %[[#R+33]], i32 3
-  ; FAST16: %[[#VREG+4]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+3]], i[[#SBITS]] %[[#R+33]], i32 4
-  ; FAST16: %[[#VREG+5]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+4]], i[[#SBITS]] %[[#R+33]], i32 5
-  ; FAST16: %[[#VREG+6]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+5]], i[[#SBITS]] %[[#R+33]], i32 6
-  ; FAST16: %[[#VREG+7]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+6]], i[[#SBITS]] %[[#R+33]], i32 7
-  ; FAST16: %[[#VREG+8]] = bitcast i[[#SBITS]]* %[[P:.*]] to <8 x i[[#SBITS]]>*
-  ; FAST16: %[[#VREG+9]]  = getelementptr <8 x i[[#SBITS]]>, <8 x i[[#SBITS]]>* %[[#VREG+8]], i32 0
-  ; FAST16: store <8 x i[[#SBITS]]> %[[#VREG+7]], <8 x i[[#SBITS]]>* %[[#VREG+9]], align [[#SBYTES]]
-  ; FAST16: %[[#VREG+10]] = getelementptr <8 x i[[#SBITS]]>, <8 x i[[#SBITS]]>* %[[#VREG+8]], i32 1
-  ; FAST16: store <8 x i[[#SBITS]]> %[[#VREG+7]], <8 x i[[#SBITS]]>* %[[#VREG+10]], align [[#SBYTES]]
-  ; FAST16: %[[#VREG+11]] = getelementptr i[[#SBITS]], i[[#SBITS]]* %[[P]], i32 16
-  ; FAST16: store i[[#SBITS]] %[[#R+33]], i[[#SBITS]]* %[[#VREG+11]], align [[#SBYTES]]
+  ; FAST: @"dfs$store_array17"
+  ; FAST: %[[#R:]]   = load [17 x i[[#SBITS]]], [17 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [17 x i[[#SBITS]]]*), align 2
+  ; FAST: %[[#R+1]]  = extractvalue [17 x i[[#SBITS]]] %[[#R]], 0
+  ; FAST: %[[#R+2]]  = extractvalue [17 x i[[#SBITS]]] %[[#R]], 1
+  ; FAST: %[[#R+3]]  = or i[[#SBITS]] %[[#R+1]], %[[#R+2]]
+  ; FAST: %[[#R+4]]  = extractvalue [17 x i[[#SBITS]]] %[[#R]], 2
+  ; FAST: %[[#R+5]]  = or i[[#SBITS]] %[[#R+3]], %[[#R+4]]
+  ; FAST: %[[#R+6]]  = extractvalue [17 x i[[#SBITS]]] %[[#R]], 3
+  ; FAST: %[[#R+7]]  = or i[[#SBITS]] %[[#R+5]], %[[#R+6]]
+  ; FAST: %[[#R+8]]  = extractvalue [17 x i[[#SBITS]]] %[[#R]], 4
+  ; FAST: %[[#R+9]]  = or i[[#SBITS]] %[[#R+7]], %[[#R+8]]
+  ; FAST: %[[#R+10]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 5
+  ; FAST: %[[#R+11]] = or i[[#SBITS]] %[[#R+9]], %[[#R+10]]
+  ; FAST: %[[#R+12]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 6
+  ; FAST: %[[#R+13]] = or i[[#SBITS]] %[[#R+11]], %[[#R+12]]
+  ; FAST: %[[#R+14]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 7
+  ; FAST: %[[#R+15]] = or i[[#SBITS]] %[[#R+13]], %[[#R+14]]
+  ; FAST: %[[#R+16]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 8
+  ; FAST: %[[#R+17]] = or i[[#SBITS]] %[[#R+15]], %[[#R+16]]
+  ; FAST: %[[#R+18]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 9
+  ; FAST: %[[#R+19]] = or i[[#SBITS]] %[[#R+17]], %[[#R+18]]
+  ; FAST: %[[#R+20]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 10
+  ; FAST: %[[#R+21]] = or i[[#SBITS]] %[[#R+19]], %[[#R+20]]
+  ; FAST: %[[#R+22]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 11
+  ; FAST: %[[#R+23]] = or i[[#SBITS]] %[[#R+21]], %[[#R+22]]
+  ; FAST: %[[#R+24]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 12
+  ; FAST: %[[#R+25]] = or i[[#SBITS]] %[[#R+23]], %[[#R+24]]
+  ; FAST: %[[#R+26]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 13
+  ; FAST: %[[#R+27]] = or i[[#SBITS]] %[[#R+25]], %[[#R+26]]
+  ; FAST: %[[#R+28]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 14
+  ; FAST: %[[#R+29]] = or i[[#SBITS]] %[[#R+27]], %[[#R+28]]
+  ; FAST: %[[#R+30]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 15
+  ; FAST: %[[#R+31]] = or i[[#SBITS]] %[[#R+29]], %[[#R+30]]
+  ; FAST: %[[#R+32]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 16
+  ; FAST: %[[#R+33]] = or i[[#SBITS]] %[[#R+31]], %[[#R+32]]
+  ; FAST: %[[#VREG:]]  = insertelement <8 x i[[#SBITS]]> undef, i[[#SBITS]] %[[#R+33]], i32 0
+  ; FAST: %[[#VREG+1]] = insertelement <8 x i[[#SBITS]]> %[[#VREG]], i[[#SBITS]] %[[#R+33]], i32 1
+  ; FAST: %[[#VREG+2]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+1]], i[[#SBITS]] %[[#R+33]], i32 2
+  ; FAST: %[[#VREG+3]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+2]], i[[#SBITS]] %[[#R+33]], i32 3
+  ; FAST: %[[#VREG+4]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+3]], i[[#SBITS]] %[[#R+33]], i32 4
+  ; FAST: %[[#VREG+5]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+4]], i[[#SBITS]] %[[#R+33]], i32 5
+  ; FAST: %[[#VREG+6]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+5]], i[[#SBITS]] %[[#R+33]], i32 6
+  ; FAST: %[[#VREG+7]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+6]], i[[#SBITS]] %[[#R+33]], i32 7
+  ; FAST: %[[#VREG+8]] = bitcast i[[#SBITS]]* %[[P:.*]] to <8 x i[[#SBITS]]>*
+  ; FAST: %[[#VREG+9]]  = getelementptr <8 x i[[#SBITS]]>, <8 x i[[#SBITS]]>* %[[#VREG+8]], i32 0
+  ; FAST: store <8 x i[[#SBITS]]> %[[#VREG+7]], <8 x i[[#SBITS]]>* %[[#VREG+9]], align [[#SBYTES]]
+  ; FAST: %[[#VREG+10]] = getelementptr <8 x i[[#SBITS]]>, <8 x i[[#SBITS]]>* %[[#VREG+8]], i32 1
+  ; FAST: store <8 x i[[#SBITS]]> %[[#VREG+7]], <8 x i[[#SBITS]]>* %[[#VREG+10]], align [[#SBYTES]]
+  ; FAST: %[[#VREG+11]] = getelementptr i[[#SBITS]], i[[#SBITS]]* %[[P]], i32 16
+  ; FAST: store i[[#SBITS]] %[[#R+33]], i[[#SBITS]]* %[[#VREG+11]], align [[#SBYTES]]
   store [17 x i1] %a, [17 x i1]* %p
   ret void
 }
 
 define [2 x i32] @const_array() {
-  ; FAST16: @"dfs$const_array"
-  ; FAST16: store [2 x i[[#SBITS]]] zeroinitializer, [2 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [2 x i[[#SBITS]]]*), align 2
+  ; FAST: @"dfs$const_array"
+  ; FAST: store [2 x i[[#SBITS]]] zeroinitializer, [2 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [2 x i[[#SBITS]]]*), align 2
   ret [2 x i32] [ i32 42, i32 11 ]
 }
 
 define [4 x i8] @call_array([4 x i8] %a) {
-  ; FAST16-LABEL: @"dfs$call_array"
-  ; FAST16: %[[#R:]] = load [4 x i[[#SBITS]]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [4 x i[[#SBITS]]]*), align [[ALIGN:2]]
-  ; FAST16: store [4 x i[[#SBITS]]] %[[#R]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]]
-  ; FAST16: %_dfsret = load [4 x i[[#SBITS]]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]]
-  ; FAST16: store [4 x i[[#SBITS]]] %_dfsret, [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]]
+  ; FAST-LABEL: @"dfs$call_array"
+  ; FAST: %[[#R:]] = load [4 x i[[#SBITS]]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [4 x i[[#SBITS]]]*), align [[ALIGN:2]]
+  ; FAST: store [4 x i[[#SBITS]]] %[[#R]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]]
+  ; FAST: %_dfsret = load [4 x i[[#SBITS]]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]]
+  ; FAST: store [4 x i[[#SBITS]]] %_dfsret, [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]]
 
   %r = call [4 x i8] @pass_array([4 x i8] %a)
   ret [4 x i8] %r
@@ -324,31 +329,31 @@
 %LargeArr = type [1000 x i8]
 
 define i8 @fun_with_large_args(i1 %i, %LargeArr %a) {
-  ; FAST16: @"dfs$fun_with_large_args"
-  ; FAST16: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2
+  ; FAST: @"dfs$fun_with_large_args"
+  ; FAST: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2
   %r = extractvalue %LargeArr %a, 0
   ret i8 %r
 }
 
 define %LargeArr @fun_with_large_ret() {
-  ; FAST16: @"dfs$fun_with_large_ret"
-  ; FAST16-NEXT: ret  [1000 x i8] zeroinitializer
+  ; FAST: @"dfs$fun_with_large_ret"
+  ; FAST-NEXT: ret  [1000 x i8] zeroinitializer
   ret %LargeArr zeroinitializer
 }
 
 define i8 @call_fun_with_large_ret() {
-  ; FAST16: @"dfs$call_fun_with_large_ret"
-  ; FAST16: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2
+  ; FAST: @"dfs$call_fun_with_large_ret"
+  ; FAST: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2
   %r = call %LargeArr @fun_with_large_ret()
   %e = extractvalue %LargeArr %r, 0
   ret i8 %e
 }
 
 define i8 @call_fun_with_large_args(i1 %i, %LargeArr %a) {
-  ; FAST16: @"dfs$call_fun_with_large_args"
-  ; FAST16: [[I:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
-  ; FAST16: store i[[#SBITS]] [[I]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
-  ; FAST16: %r = call i8 @"dfs$fun_with_large_args"(i1 %i, [1000 x i8] %a)
+  ; FAST: @"dfs$call_fun_with_large_args"
+  ; FAST: [[I:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
+  ; FAST: store i[[#SBITS]] [[I]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; FAST: %r = call i8 @"dfs$fun_with_large_args"(i1 %i, [1000 x i8] %a)
 
   %r = call i8 @fun_with_large_args(i1 %i, %LargeArr %a)
   ret i8 %r
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll b/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK16
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK
 ;
 ; The patterns about origins cannot be tested until the origin tracking feature is complete.
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll b/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,CHECK_NO_ORIGIN -DSHADOW_MASK=-123145302310913
 ; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK_ORIGIN -DSHADOW_MASK=-123145302310913
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -S | FileCheck %s --check-prefixes=CHECK,CHECK_NO_ORIGIN -DSHADOW_MASK=-105553116266497
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/call.ll b/llvm/test/Instrumentation/DataFlowSanitizer/call.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/call.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/call.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels -S | FileCheck %s
 ; RUN: opt < %s -passes=dfsan -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/external_mask.ll b/llvm/test/Instrumentation/DataFlowSanitizer/external_mask.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/external_mask.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/external_mask.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,CHECK16
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK16
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/fast16labels.ll b/llvm/test/Instrumentation/DataFlowSanitizer/fast16labels.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/fast16labels.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/fast16labels.ll
@@ -1,6 +1,7 @@
 ; Test that -dfsan-fast-16-labels mode uses inline ORs rather than calling
 ; __dfsan_union or __dfsan_union_load.
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -S | FileCheck %s --implicit-check-not="call{{.*}}__dfsan_union" --check-prefixes=CHECK,CHECK16
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels -S | FileCheck %s --implicit-check-not="call{{.*}}__dfsan_union" --check-prefixes=CHECK,CHECK8
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -13,7 +14,7 @@
   ; CHECK-LABEL: define i8 @"dfs$add"
   ; CHECK-DAG: %[[ALABEL:.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
   ; CHECK-DAG: %[[BLABEL:.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN]]
-  ; CHECK: %[[ADDLABEL:.*]] = or i16 %[[ALABEL]], %[[BLABEL]]
+  ; CHECK: %[[ADDLABEL:.*]] = or i[[#SBITS]] %[[ALABEL]], %[[BLABEL]]
   ; CHECK: %c = add i8 %a, %b
   ; CHECK: store i[[#SBITS]] %[[ADDLABEL]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
   ; CHECK: ret i8 %c
@@ -24,7 +25,7 @@
 define i8 @load8(i8* %p) {
   ; CHECK-LABEL:  define i8 @"dfs$load8"
   ; CHECK-SAME:   (i8* %[[PADDR:.*]])
-  ; CHECK-NEXT:   %[[#ARG:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i16*), align [[ALIGN]]
+  ; CHECK-NEXT:   %[[#ARG:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
   ; CHECK-NEXT:   %[[#R:]] = ptrtoint i8* %[[PADDR]] to i64
   ; CHECK-NEXT:   %[[#PS:R+1]] = and i64 %[[#R]], [[#%.10d,MASK:]]
   ; CHECK16-NEXT: %[[#PS:R+2]] = mul i64 %[[#R+1]], 2
@@ -106,6 +107,16 @@
   ; CHECK16-NEXT: %[[#WS+5]]        = trunc i64 %[[#WS+4]] to i[[#SBITS]]
   ; CHECK16-NEXT: %[[#S_OUT:]]      = or i[[#SBITS]] %[[#WS+5]], %[[#ARG]]
 
+  ; COMM: On fast8, no need to OR the wide shadow but one more shift is needed.
+  ; CHECK8-NEXT: %[[#WS+1]]         = lshr i64 %[[#WS]], 32
+  ; CHECK8-NEXT: %[[#WS+2]]         = or i64 %[[#WS]], %[[#WS+1]]
+  ; CHECK8-NEXT: %[[#WS+3]]         = lshr i64 %[[#WS+2]], 16
+  ; CHECK8-NEXT: %[[#WS+4]]         = or i64 %[[#WS+2]], %[[#WS+3]]
+  ; CHECK8-NEXT: %[[#WS+5]]         = lshr i64 %[[#WS+4]], 8
+  ; CHECK8-NEXT: %[[#WS+6]]         = or i64 %[[#WS+4]], %[[#WS+5]]
+  ; CHECK8-NEXT: %[[#WS+7]]         = trunc i64 %[[#WS+6]] to i[[#SBITS]]
+  ; CHECK8-NEXT: %[[#S_OUT:]]       = or i[[#SBITS]] %[[#WS+7]], %[[#ARG]]
+
   ; CHECK-NEXT:   %a = load i64, i64* %p
   ; CHECK-NEXT:   store i[[#SBITS]] %[[#S_OUT]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
   ; CHECK-NEXT:   ret i64 %a
@@ -142,6 +153,16 @@
   ; CHECK16-NEXT: %[[#WS+5]]    = trunc i64 %[[#WS+4]] to i[[#SBITS]]
   ; CHECK16-NEXT: %[[#S_OUT:]]  = or i[[#SBITS]] %[[#WS+5]], %[[#ARG]]
 
+  ; COMM: On fast8, we need to OR 2x64bits for the wide shadow, before ORing its bytes (one more shift).
+  ; CHECK8-NEXT: %[[#WS+1]]     = lshr i64 %[[#WS]], 32
+  ; CHECK8-NEXT: %[[#WS+2]]     = or i64 %[[#WS]], %[[#WS+1]]
+  ; CHECK8-NEXT: %[[#WS+3]]     = lshr i64 %[[#WS+2]], 16
+  ; CHECK8-NEXT: %[[#WS+4]]     = or i64 %[[#WS+2]], %[[#WS+3]]
+  ; CHECK8-NEXT: %[[#WS+5]]     = lshr i64 %[[#WS+4]], 8
+  ; CHECK8-NEXT: %[[#WS+6]]     = or i64 %[[#WS+4]], %[[#WS+5]]
+  ; CHECK8-NEXT: %[[#WS+7]]     = trunc i64 %[[#WS+6]] to i[[#SBITS]]
+  ; CHECK8-NEXT: %[[#S_OUT:]]   = or i[[#SBITS]] %[[#WS+7]], %[[#ARG]]
+
   ; CHECK-NEXT: %a = load i128, i128* %p
   ; CHECK-NEXT: store i[[#SBITS]] %[[#S_OUT]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
   ; CHECK-NEXT: ret i128 %a
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/phi.ll b/llvm/test/Instrumentation/DataFlowSanitizer/phi.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/phi.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/phi.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,LEGACY
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -11,7 +12,7 @@
   ; LEGACY: [[PL:%.*]] = phi i[[#SBITS]] [ [[AL]], %T ], [ [[AL]], %F ]
   ; LEGACY: store i[[#SBITS]] [[PL]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
 
-  ; FAST: [[AL:%.*]] = load { [[ST:i[0-9]+]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([100 x i64]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
+  ; FAST: [[AL:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([100 x i64]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
   ; FAST: [[AL0:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[AL]], i[[#SBITS]] 0, 0
   ; FAST: [[AL1:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[AL]], i[[#SBITS]] 0, 1
   ; FAST: [[PL:%.*]] = phi { i[[#SBITS]], i[[#SBITS]] } [ [[AL0]], %T ], [ [[AL1]], %F ]
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/select.ll b/llvm/test/Instrumentation/DataFlowSanitizer/select.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/select.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/select.ll
@@ -2,6 +2,8 @@
 ; RUN: opt < %s -dfsan -dfsan-track-select-control-flow=0 -S | FileCheck %s --check-prefixes=CHECK,NO_TRACK_CF,NO_TRACK_CF_LEGACY
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -dfsan-track-select-control-flow=1 -S | FileCheck %s --check-prefixes=CHECK,TRACK_CF,TRACK_CF_FAST
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -dfsan-track-select-control-flow=0 -S | FileCheck %s --check-prefixes=CHECK,NO_TRACK_CF,NO_TRACK_CF_FAST
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels -dfsan-track-select-control-flow=1 -S | FileCheck %s --check-prefixes=CHECK,TRACK_CF,TRACK_CF_FAST
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels -dfsan-track-select-control-flow=0 -S | FileCheck %s --check-prefixes=CHECK,NO_TRACK_CF,NO_TRACK_CF_FAST
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll b/llvm/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -dfsan -S --dfsan-abilist=%S/Inputs/shadow-args-abilist.txt | FileCheck %s
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -dfsan -S --dfsan-abilist=%S/Inputs/shadow-args-abilist.txt -dfsan-fast-16-labels | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -dfsan -S --dfsan-abilist=%S/Inputs/shadow-args-abilist.txt -dfsan-fast-8-labels | FileCheck %s
 
 ; REQUIRES: x86-registered-target
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/store.ll b/llvm/test/Instrumentation/DataFlowSanitizer/store.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/store.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/store.ll
@@ -2,6 +2,8 @@
 ; RUN: opt < %s -dfsan -dfsan-combine-pointer-labels-on-store=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK16,NO_COMBINE_PTR_LABEL
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -dfsan-combine-pointer-labels-on-store=1 -S | FileCheck %s --check-prefixes=CHECK,CHECK16,COMBINE_PTR_LABEL_FAST
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -dfsan-combine-pointer-labels-on-store=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK16,NO_COMBINE_PTR_LABEL
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels -dfsan-combine-pointer-labels-on-store=1 -S | FileCheck %s --check-prefixes=CHECK,COMBINE_PTR_LABEL_FAST
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels -dfsan-combine-pointer-labels-on-store=0 -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_PTR_LABEL
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/struct.ll b/llvm/test/Instrumentation/DataFlowSanitizer/struct.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/struct.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/struct.ll
@@ -1,11 +1,16 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,LEGACY
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-event-callbacks=true -S | FileCheck %s --check-prefixes=CHECK,EVENT_CALLBACKS
 ; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s --check-prefixes=CHECK,ARGS_ABI
-; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST16
+; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_LOAD_PTR
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-track-select-control-flow=false -S | FileCheck %s --check-prefixes=CHECK,NO_SELECT_CONTROL
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-debug-nonzero-labels -S | FileCheck %s --check-prefixes=CHECK,DEBUG_NONZERO_LABELS
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_LOAD_PTR
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-track-select-control-flow=false -S | FileCheck %s --check-prefixes=CHECK,NO_SELECT_CONTROL
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-debug-nonzero-labels -S | FileCheck %s --check-prefixes=CHECK,DEBUG_NONZERO_LABELS
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -73,18 +78,18 @@
   ; NO_SELECT_CONTROL: [[S:%.*]] = select i1 %c, { i[[#SBITS]], i[[#SBITS]] } [[A]], { i[[#SBITS]], i[[#SBITS]] } [[B]]
   ; NO_SELECT_CONTROL: store { i[[#SBITS]], i[[#SBITS]] } [[S]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
 
-  ; FAST16: @"dfs$select_struct"
-  ; FAST16: %[[#R:]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES) + 2]]) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
-  ; FAST16: %[[#R+1]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
-  ; FAST16: %[[#R+2]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
-  ; FAST16: %[[#R+3]] = select i1 %c, { i[[#SBITS]], i[[#SBITS]] } %[[#R+1]], { i[[#SBITS]], i[[#SBITS]] } %[[#R]]
-  ; FAST16: %[[#R+4]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } %[[#R+3]], 0
-  ; FAST16: %[[#R+5]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } %[[#R+3]], 1
-  ; FAST16: %[[#R+6]] = or i[[#SBITS]] %[[#R+4]], %[[#R+5]]
-  ; FAST16: %[[#R+7]] = or i[[#SBITS]] %[[#R+2]], %[[#R+6]]
-  ; FAST16: %[[#R+8]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } undef, i[[#SBITS]] %[[#R+7]], 0
-  ; FAST16: %[[#R+9]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } %[[#R+8]], i[[#SBITS]] %[[#R+7]], 1
-  ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } %[[#R+9]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
+  ; FAST: @"dfs$select_struct"
+  ; FAST: %[[#R:]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES) + 2]]) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
+  ; FAST: %[[#R+1]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
+  ; FAST: %[[#R+2]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; FAST: %[[#R+3]] = select i1 %c, { i[[#SBITS]], i[[#SBITS]] } %[[#R+1]], { i[[#SBITS]], i[[#SBITS]] } %[[#R]]
+  ; FAST: %[[#R+4]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } %[[#R+3]], 0
+  ; FAST: %[[#R+5]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } %[[#R+3]], 1
+  ; FAST: %[[#R+6]] = or i[[#SBITS]] %[[#R+4]], %[[#R+5]]
+  ; FAST: %[[#R+7]] = or i[[#SBITS]] %[[#R+2]], %[[#R+6]]
+  ; FAST: %[[#R+8]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } undef, i[[#SBITS]] %[[#R+7]], 0
+  ; FAST: %[[#R+9]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } %[[#R+8]], i[[#SBITS]] %[[#R+7]], 1
+  ; FAST: store { i[[#SBITS]], i[[#SBITS]] } %[[#R+9]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
 
   ; LEGACY: @"dfs$select_struct"
   ; LEGACY: [[U:%.*]] = call zeroext i[[#SBITS]] @__dfsan_union
@@ -96,13 +101,13 @@
 }
 
 define { i32, i32 } @asm_struct(i32 %0, i32 %1) {
-  ; FAST16: @"dfs$asm_struct"
-  ; FAST16: [[E1:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN:2]]
-  ; FAST16: [[E0:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
-  ; FAST16: [[E01:%.*]] = or i[[#SBITS]] [[E0]], [[E1]]
-  ; FAST16: [[S0:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } undef, i[[#SBITS]] [[E01]], 0
-  ; FAST16: [[S1:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[S0]], i[[#SBITS]] [[E01]], 1
-  ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } [[S1]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
+  ; FAST: @"dfs$asm_struct"
+  ; FAST: [[E1:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN:2]]
+  ; FAST: [[E0:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; FAST: [[E01:%.*]] = or i[[#SBITS]] [[E0]], [[E1]]
+  ; FAST: [[S0:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } undef, i[[#SBITS]] [[E01]], 0
+  ; FAST: [[S1:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[S0]], i[[#SBITS]] [[E01]], 1
+  ; FAST: store { i[[#SBITS]], i[[#SBITS]] } [[S1]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
 
   ; LEGACY: @"dfs$asm_struct"
   ; LEGACY: [[E1:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN:2]]
@@ -117,8 +122,8 @@
 }
 
 define {i32, i32} @const_struct() {
-  ; FAST16: @"dfs$const_struct"
-  ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } zeroinitializer, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align 2
+  ; FAST: @"dfs$const_struct"
+  ; FAST: store { i[[#SBITS]], i[[#SBITS]] } zeroinitializer, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align 2
 
   ; LEGACY: @"dfs$const_struct"
   ; LEGACY: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2
@@ -126,10 +131,10 @@
 }
 
 define i1 @extract_struct({i1, i5} %s) {
-  ; FAST16: @"dfs$extract_struct"
-  ; FAST16: [[SM:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
-  ; FAST16: [[EM:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[SM]], 0
-  ; FAST16: store i[[#SBITS]] [[EM]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; FAST: @"dfs$extract_struct"
+  ; FAST: [[SM:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
+  ; FAST: [[EM:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[SM]], 0
+  ; FAST: store i[[#SBITS]] [[EM]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
 
   ; LEGACY: @"dfs$extract_struct"
   ; LEGACY: [[SM:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
@@ -139,11 +144,11 @@
 }
 
 define {i1, i5} @insert_struct({i1, i5} %s, i5 %e1) {
-  ; FAST16: @"dfs$insert_struct"
-  ; FAST16: [[EM:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES)]]) to i[[#SBITS]]*), align [[ALIGN:2]]
-  ; FAST16: [[SM:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
-  ; FAST16: [[SM1:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[SM]], i[[#SBITS]] [[EM]], 1
-  ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } [[SM1]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
+  ; FAST: @"dfs$insert_struct"
+  ; FAST: [[EM:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES)]]) to i[[#SBITS]]*), align [[ALIGN:2]]
+  ; FAST: [[SM:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
+  ; FAST: [[SM1:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[SM]], i[[#SBITS]] [[EM]], 1
+  ; FAST: store { i[[#SBITS]], i[[#SBITS]] } [[SM1]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
 
   ; LEGACY: @"dfs$insert_struct"
   ; LEGACY: [[EM:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN:2]]
@@ -173,15 +178,15 @@
 }
 
 define void @store_struct({i1, i1}* %p, {i1, i1} %s) {
-  ; FAST16: @"dfs$store_struct"
-  ; FAST16: [[S:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
-  ; FAST16: [[E0:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[S]], 0
-  ; FAST16: [[E1:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[S]], 1
-  ; FAST16: [[E:%.*]] = or i[[#SBITS]] [[E0]], [[E1]]
-  ; FAST16: [[P0:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[P:%.*]], i32 0
-  ; FAST16: store i[[#SBITS]] [[E]], i[[#SBITS]]* [[P0]], align [[#SBYTES]]
-  ; FAST16: [[P1:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[P]], i32 1
-  ; FAST16: store i[[#SBITS]] [[E]], i[[#SBITS]]* [[P1]], align [[#SBYTES]]
+  ; FAST: @"dfs$store_struct"
+  ; FAST: [[S:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
+  ; FAST: [[E0:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[S]], 0
+  ; FAST: [[E1:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[S]], 1
+  ; FAST: [[E:%.*]] = or i[[#SBITS]] [[E0]], [[E1]]
+  ; FAST: [[P0:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[P:%.*]], i32 0
+  ; FAST: store i[[#SBITS]] [[E]], i[[#SBITS]]* [[P0]], align [[#SBYTES]]
+  ; FAST: [[P1:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[P]], i32 1
+  ; FAST: store i[[#SBITS]] [[E]], i[[#SBITS]]* [[P1]], align [[#SBYTES]]
 
   ; EVENT_CALLBACKS: @"dfs$store_struct"
   ; EVENT_CALLBACKS: [[OL:%.*]] = or i[[#SBITS]]
@@ -204,68 +209,68 @@
 }
 
 define i2 @extract_struct_of_aggregate11(%StructOfAggr %s) {
-  ; FAST16: @"dfs$extract_struct_of_aggregate11"
-  ; FAST16: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]]
-  ; FAST16: [[E11:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 1, 1
-  ; FAST16: store i[[#SBITS]] [[E11]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; FAST: @"dfs$extract_struct_of_aggregate11"
+  ; FAST: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]]
+  ; FAST: [[E11:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 1, 1
+  ; FAST: store i[[#SBITS]] [[E11]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
 
   %e11 = extractvalue %StructOfAggr %s, 1, 1
   ret i2 %e11
 }
 
 define [4 x i2] @extract_struct_of_aggregate1(%StructOfAggr %s) {
-  ; FAST16: @"dfs$extract_struct_of_aggregate1"
-  ; FAST16: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]]
-  ; FAST16: [[E1:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 1
-  ; FAST16: store [4 x i[[#SBITS]]] [[E1]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]]
+  ; FAST: @"dfs$extract_struct_of_aggregate1"
+  ; FAST: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]]
+  ; FAST: [[E1:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 1
+  ; FAST: store [4 x i[[#SBITS]]] [[E1]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]]
   %e1 = extractvalue %StructOfAggr %s, 1
   ret [4 x i2] %e1
 }
 
 define <4 x i3> @extract_struct_of_aggregate2(%StructOfAggr %s) {
-  ; FAST16: @"dfs$extract_struct_of_aggregate2"
-  ; FAST16: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]]
-  ; FAST16: [[E2:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 2
-  ; FAST16: store i[[#SBITS]] [[E2]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; FAST: @"dfs$extract_struct_of_aggregate2"
+  ; FAST: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]]
+  ; FAST: [[E2:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 2
+  ; FAST: store i[[#SBITS]] [[E2]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
   %e2 = extractvalue %StructOfAggr %s, 2
   ret <4 x i3> %e2
 }
 
 define { i1, i1 } @extract_struct_of_aggregate3(%StructOfAggr %s) {
-  ; FAST16: @"dfs$extract_struct_of_aggregate3"
-  ; FAST16: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]]
-  ; FAST16: [[E3:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 3
-  ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } [[E3]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
+  ; FAST: @"dfs$extract_struct_of_aggregate3"
+  ; FAST: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]]
+  ; FAST: [[E3:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 3
+  ; FAST: store { i[[#SBITS]], i[[#SBITS]] } [[E3]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
   %e3 = extractvalue %StructOfAggr %s, 3
   ret { i1, i1 } %e3
 }
 
 define i1 @extract_struct_of_aggregate31(%StructOfAggr %s) {
-  ; FAST16: @"dfs$extract_struct_of_aggregate31"
-  ; FAST16: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]]
-  ; FAST16: [[E31:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 3, 1
-  ; FAST16: store i[[#SBITS]] [[E31]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; FAST: @"dfs$extract_struct_of_aggregate31"
+  ; FAST: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]]
+  ; FAST: [[E31:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 3, 1
+  ; FAST: store i[[#SBITS]] [[E31]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
   %e31 = extractvalue %StructOfAggr %s, 3, 1
   ret i1 %e31
 }
 
 define %StructOfAggr @insert_struct_of_aggregate11(%StructOfAggr %s, i2 %e11) {
-  ; FAST16: @"dfs$insert_struct_of_aggregate11"
-  ; FAST16: [[E11:%.*]]  = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(8, SBYTES)]]) to i[[#SBITS]]*), align [[ALIGN:2]]
-  ; FAST16: [[S:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]]
-  ; FAST16: [[S1:%.*]] = insertvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[S]], i[[#SBITS]] [[E11]], 1, 1
-  ; FAST16: store { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[S1]], { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]]
+  ; FAST: @"dfs$insert_struct_of_aggregate11"
+  ; FAST: [[E11:%.*]]  = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(8, SBYTES)]]) to i[[#SBITS]]*), align [[ALIGN:2]]
+  ; FAST: [[S:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]]
+  ; FAST: [[S1:%.*]] = insertvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[S]], i[[#SBITS]] [[E11]], 1, 1
+  ; FAST: store { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[S1]], { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]]
 
   %s1 = insertvalue %StructOfAggr %s, i2 %e11, 1, 1
   ret %StructOfAggr %s1
 }
 
 define {i8*, i32} @call_struct({i8*, i32} %s) {
-  ; FAST16: @"dfs$call_struct"
-  ; FAST16: [[S:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
-  ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } [[S]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
-  ; FAST16: %_dfsret = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
-  ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } %_dfsret, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
+  ; FAST: @"dfs$call_struct"
+  ; FAST: [[S:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
+  ; FAST: store { i[[#SBITS]], i[[#SBITS]] } [[S]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
+  ; FAST: %_dfsret = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
+  ; FAST: store { i[[#SBITS]], i[[#SBITS]] } %_dfsret, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
 
   %r = call {i8*, i32} @pass_struct({i8*, i32} %s)
   ret {i8*, i32} %r
@@ -274,15 +279,15 @@
 declare %StructOfAggr @fun_with_many_aggr_args(<2 x i7> %v, [2 x i5] %a, {i3, i3} %s)
 
 define %StructOfAggr @call_many_aggr_args(<2 x i7> %v, [2 x i5] %a, {i3, i3} %s) {
-  ; FAST16: @"dfs$call_many_aggr_args"
-  ; FAST16: [[S:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES) + 2]]) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
-  ; FAST16: [[A:%.*]] = load [2 x i[[#SBITS]]], [2 x i[[#SBITS]]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to [2 x i[[#SBITS]]]*), align [[ALIGN]]
-  ; FAST16: [[V:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
-  ; FAST16: store i[[#SBITS]] [[V]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
-  ; FAST16: store [2 x i[[#SBITS]]] [[A]], [2 x i[[#SBITS]]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to [2 x i[[#SBITS]]]*), align [[ALIGN]]
-  ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } [[S]], { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES) + 2]]) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
-  ; FAST16: %_dfsret = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]]
-  ; FAST16: store { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } %_dfsret, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]]
+  ; FAST: @"dfs$call_many_aggr_args"
+  ; FAST: [[S:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES) + 2]]) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
+  ; FAST: [[A:%.*]] = load [2 x i[[#SBITS]]], [2 x i[[#SBITS]]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to [2 x i[[#SBITS]]]*), align [[ALIGN]]
+  ; FAST: [[V:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; FAST: store i[[#SBITS]] [[V]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; FAST: store [2 x i[[#SBITS]]] [[A]], [2 x i[[#SBITS]]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to [2 x i[[#SBITS]]]*), align [[ALIGN]]
+  ; FAST: store { i[[#SBITS]], i[[#SBITS]] } [[S]], { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES) + 2]]) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]]
+  ; FAST: %_dfsret = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]]
+  ; FAST: store { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } %_dfsret, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]]
 
   %r = call %StructOfAggr @fun_with_many_aggr_args(<2 x i7> %v, [2 x i5] %a, {i3, i3} %s)
   ret %StructOfAggr %r
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/vector.ll b/llvm/test/Instrumentation/DataFlowSanitizer/vector.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/vector.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/vector.ll
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI,TLS_ABI_LEGACY
 ; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s --check-prefixes=CHECK,ARGS_ABI
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI,TLS_ABI_FAST
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI,TLS_ABI_FAST
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"