diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -200,6 +200,14 @@
              "labels to 16."),
     cl::Hidden, cl::init(false));
 
+// Use one bit per base label (max 8) so that unions are plain ORs, needing
+// less instrumentation.  Mutually exclusive with -dfsan-fast-16-labels.
+static cl::opt<bool> ClFast8Labels(
+    "dfsan-fast-8-labels",
+    cl::desc("Use more efficient instrumentation, limiting the number of "
+             "labels to 8."),
+    cl::Hidden, cl::init(false));
+
 // Controls whether the pass tracks the control flow of select instructions.
 static cl::opt<bool> ClTrackSelectControlFlow(
     "dfsan-track-select-control-flow",
@@ -341,8 +349,6 @@
   friend class DFSanVisitor;
 
   enum {
-    ShadowWidthBits = 16,
-    ShadowWidthBytes = ShadowWidthBits / 8,
     OriginWidthBits = 32,
     OriginWidthBytes = OriginWidthBits / 8
   };
@@ -383,6 +389,9 @@
     WK_Custom
   };
 
+  unsigned ShadowWidthBits;
+  unsigned ShadowWidthBytes;
+
   Module *Mod;
   LLVMContext *Ctx;
   Type *Int8Ptr;
@@ -419,7 +428,7 @@
   FunctionCallee DFSanUnionFn;
   FunctionCallee DFSanCheckedUnionFn;
   FunctionCallee DFSanUnionLoadFn;
-  FunctionCallee DFSanUnionLoadFast16LabelsFn;
+  FunctionCallee DFSanUnionLoadFastLabelsFn;
   FunctionCallee DFSanLoadLabelAndOriginFn;
   FunctionCallee DFSanUnimplementedFn;
   FunctionCallee DFSanSetLabelFn;
@@ -442,6 +451,7 @@
 
   Value *getShadowOffset(Value *Addr, IRBuilder<> &IRB);
   Value *getShadowAddress(Value *Addr, Instruction *Pos);
+  Value *getShadowAddress(Value *Addr, Instruction *Pos, Value *ShadowOffset);
   std::pair<Value *, Value *>
   getShadowOriginAddress(Value *Addr, Align InstAlignment, Instruction *Pos);
   bool isInstrumented(const Function *F);
@@ -733,6 +743,14 @@
 
 DataFlowSanitizer::DataFlowSanitizer(
     const std::vector<std::string> &ABIListFiles) {
+  if (ClFast8Labels && ClFast16Labels) {
+    report_fatal_error(
+        "cannot set both -dfsan-fast-8-labels and -dfsan-fast-16-labels");
+  }
+
+  ShadowWidthBits = ClFast8Labels ? 8 : 16;
+  ShadowWidthBytes = ShadowWidthBits / 8;
+
   std::vector<std::string> AllABIListFiles(std::move(ABIListFiles));
   llvm::append_range(AllABIListFiles, ClABIListFiles);
   // FIXME: should we propagate vfs::FileSystem to this constructor?
@@ -835,7 +853,9 @@
 }
 
 bool DataFlowSanitizer::shouldTrackFieldsAndIndices() {
-  return getInstrumentedABI() == DataFlowSanitizer::IA_TLS && ClFast16Labels;
+  if (getInstrumentedABI() != DataFlowSanitizer::IA_TLS)
+    return false;
+  return ClFast8Labels || ClFast16Labels;
 }
 
 Constant *DataFlowSanitizer::getZeroShadow(Type *OrigTy) {
@@ -1000,11 +1020,15 @@
 
   switch (TargetTriple.getArch()) {
   case Triple::x86_64:
-    ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
+    ShadowPtrMask = ClFast8Labels
+                        ? ConstantInt::getSigned(IntptrTy, ~0x600000000000LL)
+                        : ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
     break;
   case Triple::mips64:
   case Triple::mips64el:
-    ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL);
+    ShadowPtrMask = ClFast8Labels
+                        ? ConstantInt::getSigned(IntptrTy, ~0xE000000000LL)
+                        : ConstantInt::getSigned(IntptrTy, ~0xF000000000LL);
     break;
   case Triple::aarch64:
   case Triple::aarch64_be:
@@ -1238,7 +1262,7 @@
                          Attribute::ReadOnly);
     AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
                          Attribute::ZExt);
-    DFSanUnionLoadFast16LabelsFn = Mod->getOrInsertFunction(
+    DFSanUnionLoadFastLabelsFn = Mod->getOrInsertFunction(
         "__dfsan_union_load_fast16labels", DFSanUnionLoadFnTy, AL);
   }
   {
@@ -1290,7 +1314,7 @@
   DFSanRuntimeFunctions.insert(
       DFSanUnionLoadFn.getCallee()->stripPointerCasts());
   DFSanRuntimeFunctions.insert(
-      DFSanUnionLoadFast16LabelsFn.getCallee()->stripPointerCasts());
+      DFSanUnionLoadFastLabelsFn.getCallee()->stripPointerCasts());
   DFSanRuntimeFunctions.insert(
       DFSanLoadLabelAndOriginFn.getCallee()->stripPointerCasts());
   DFSanRuntimeFunctions.insert(
@@ -1757,8 +1781,7 @@
   // Returns ((Addr & shadow_mask) + origin_base) & ~4UL
   IRBuilder<> IRB(Pos);
   Value *ShadowOffset = getShadowOffset(Addr, IRB);
-  Value *ShadowPtr = IRB.CreateIntToPtr(
-      IRB.CreateMul(ShadowOffset, ShadowPtrMul), PrimitiveShadowPtrTy);
+  Value *ShadowPtr = getShadowAddress(Addr, Pos, ShadowOffset);
   Value *OriginPtr = nullptr;
   if (shouldTrackOrigins()) {
     Value *OriginLong = IRB.CreateAdd(ShadowOffset, OriginBase);
@@ -1774,12 +1797,21 @@
   return {ShadowPtr, OriginPtr};
 }
 
+Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos,
+                                           Value *ShadowOffset) {
+  IRBuilder<> IRB(Pos);
+  // Emit the scaling multiply only when the multiplier is not one.
+  if (!ShadowPtrMul->isOne())
+    ShadowOffset = IRB.CreateMul(ShadowOffset, ShadowPtrMul);
+  // NOTE(review): Addr is unused in this overload; only ShadowOffset is read.
+  return IRB.CreateIntToPtr(ShadowOffset, PrimitiveShadowPtrTy);
+}
+
 Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {
   // Returns (Addr & shadow_mask) x 2
   IRBuilder<> IRB(Pos);
   Value *ShadowOffset = getShadowOffset(Addr, IRB);
-  return IRB.CreateIntToPtr(IRB.CreateMul(ShadowOffset, ShadowPtrMul),
-                            PrimitiveShadowPtrTy);
+  return getShadowAddress(Addr, Pos, ShadowOffset);
 }
 
 Value *DFSanFunction::combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
@@ -1829,7 +1861,7 @@
   Value *PV2 = collapseToPrimitiveShadow(V2, Pos);
 
   IRBuilder<> IRB(Pos);
-  if (ClFast16Labels) {
+  if (ClFast8Labels || ClFast16Labels) {
     CCS.Block = Pos->getParent();
     CCS.Shadow = IRB.CreateOr(PV1, PV2);
   } else if (AvoidNewBlocks) {
@@ -1978,27 +2010,37 @@
 std::pair<Value *, Value *> DFSanFunction::loadFast16ShadowFast(
     Value *ShadowAddr, Value *OriginAddr, uint64_t Size, Align ShadowAlign,
     Align OriginAlign, Value *FirstOrigin, Instruction *Pos) {
-  // First OR all the WideShadows, then OR individual shadows within the
-  // combined WideShadow. This is fewer instructions than ORing shadows
-  // individually.
+  // First OR all the WideShadows (i.e., 64-bit or 32-bit shadow chunks)
+  // linearly; then fold the individual shadows within the combined WideShadow
+  // by repeated halving (shift right by half the remaining width, then OR).
+  // This takes log2(N) shift/or pairs, N being the number of shadows in a
+  // WideShadow, which is fewer instructions than ORing shadows individually.
   const bool ShouldTrackOrigins = DFS.shouldTrackOrigins();
   std::vector<Value *> Shadows;
   std::vector<Value *> Origins;
+
   IRBuilder<> IRB(Pos);
-  Value *WideAddr =
-      IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
+  uint64_t ShadowSize = Size * DFS.ShadowWidthBytes;
+  Type *WideShadowTy =
+      ShadowSize <= 4 ? Type::getInt32Ty(*DFS.Ctx) : Type::getInt64Ty(*DFS.Ctx);
+  Value *WideAddr = IRB.CreateBitCast(ShadowAddr, WideShadowTy->getPointerTo());
   Value *CombinedWideShadow =
-      IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
+      IRB.CreateAlignedLoad(WideShadowTy, WideAddr, ShadowAlign);
+
   if (ShouldTrackOrigins) {
     Shadows.push_back(CombinedWideShadow);
     Origins.push_back(FirstOrigin);
   }
-  for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size;
-       Ofs += 64 / DFS.ShadowWidthBits) {
-    WideAddr = IRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr,
+
+  unsigned WideShadowBitWidth = WideShadowTy->getIntegerBitWidth();
+  const uint64_t BytesPerWideShadow = WideShadowBitWidth / DFS.ShadowWidthBits;
+
+  for (uint64_t ByteOfs = BytesPerWideShadow; ByteOfs < Size;
+       ByteOfs += BytesPerWideShadow) {
+    WideAddr = IRB.CreateGEP(WideShadowTy, WideAddr,
                              ConstantInt::get(DFS.IntptrTy, 1));
     Value *NextWideShadow =
-        IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
+        IRB.CreateAlignedLoad(WideShadowTy, WideAddr, ShadowAlign);
     CombinedWideShadow = IRB.CreateOr(CombinedWideShadow, NextWideShadow);
     if (ShouldTrackOrigins) {
       Shadows.push_back(NextWideShadow);
@@ -2008,7 +2050,8 @@
           IRB.CreateAlignedLoad(DFS.OriginTy, OriginAddr, OriginAlign));
     }
   }
-  for (unsigned Width = 32; Width >= DFS.ShadowWidthBits; Width >>= 1) {
+  for (unsigned Width = WideShadowBitWidth / 2; Width >= DFS.ShadowWidthBits;
+       Width >>= 1) {
     Value *ShrShadow = IRB.CreateLShr(CombinedWideShadow, Width);
     CombinedWideShadow = IRB.CreateOr(CombinedWideShadow, ShrShadow);
   }
@@ -2031,16 +2074,21 @@
       DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
   FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
 
+  uint64_t ShadowSize = Size * DFS.ShadowWidthBytes;
+  Type *WideShadowTy =
+      ShadowSize <= 4 ? Type::getInt32Ty(*DFS.Ctx) : Type::getInt64Ty(*DFS.Ctx);
+
   // Compare each of the shadows stored in the loaded 64 bits to each other,
   // by computing (WideShadow rotl ShadowWidthBits) == WideShadow.
   IRBuilder<> IRB(Pos);
-  Value *WideAddr =
-      IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
+  unsigned WideShadowBitWidth = WideShadowTy->getIntegerBitWidth();
+  Value *WideAddr = IRB.CreateBitCast(ShadowAddr, WideShadowTy->getPointerTo());
   Value *WideShadow =
-      IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
+      IRB.CreateAlignedLoad(WideShadowTy, WideAddr, ShadowAlign);
   Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.PrimitiveShadowTy);
   Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidthBits);
-  Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidthBits);
+  Value *ShrShadow =
+      IRB.CreateLShr(WideShadow, WideShadowBitWidth - DFS.ShadowWidthBits);
   Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
   Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
 
@@ -2063,15 +2111,17 @@
   ReplaceInstWithInst(Head->getTerminator(), LastBr);
   DT.addNewBlock(FallbackBB, Head);
 
-  for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size;
-       Ofs += 64 / DFS.ShadowWidthBits) {
+  const uint64_t BytesPerWideShadow = WideShadowBitWidth / DFS.ShadowWidthBits;
+
+  for (uint64_t ByteOfs = BytesPerWideShadow; ByteOfs < Size;
+       ByteOfs += BytesPerWideShadow) {
     BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
     DT.addNewBlock(NextBB, LastBr->getParent());
     IRBuilder<> NextIRB(NextBB);
-    WideAddr = NextIRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr,
+    WideAddr = NextIRB.CreateGEP(WideShadowTy, WideAddr,
                                  ConstantInt::get(DFS.IntptrTy, 1));
     Value *NextWideShadow =
-        NextIRB.CreateAlignedLoad(NextIRB.getInt64Ty(), WideAddr, ShadowAlign);
+        NextIRB.CreateAlignedLoad(WideShadowTy, WideAddr, ShadowAlign);
     ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow);
     LastBr->setSuccessor(0, NextBB);
     LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB);
@@ -2158,6 +2208,7 @@
     Origin = IRB.CreateAlignedLoad(DFS.OriginTy, OriginAddr, OriginAlign);
   }
 
+  // Small sizes get direct shadow loads instead of the generic paths below.
   switch (Size) {
   case 1: {
     LoadInst *LI = new LoadInst(DFS.PrimitiveShadowTy, ShadowAddr, "", Pos);
@@ -2175,17 +2226,21 @@
     return {combineShadows(Load, Load1, Pos), Origin};
   }
   }
+  uint64_t ShadowSize = Size * DFS.ShadowWidthBytes;
+  bool HasSizeForFastPath = ShadowSize % 8 == 0 || ShadowSize == 4;
+  bool HasFastLabelsEnabled = ClFast8Labels || ClFast16Labels;
 
-  if (ClFast16Labels && Size % (64 / DFS.ShadowWidthBits) == 0)
+  if (HasFastLabelsEnabled && HasSizeForFastPath)
     return loadFast16ShadowFast(ShadowAddr, OriginAddr, Size, ShadowAlign,
                                 OriginAlign, Origin, Pos);
 
-  if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidthBits) == 0)
+  if (!AvoidNewBlocks && HasSizeForFastPath)
     return {loadLegacyShadowFast(ShadowAddr, Size, ShadowAlign, Pos), Origin};
 
   IRBuilder<> IRB(Pos);
-  FunctionCallee &UnionLoadFn =
-      ClFast16Labels ? DFS.DFSanUnionLoadFast16LabelsFn : DFS.DFSanUnionLoadFn;
+  FunctionCallee &UnionLoadFn = HasFastLabelsEnabled
+                                    ? DFS.DFSanUnionLoadFastLabelsFn
+                                    : DFS.DFSanUnionLoadFn;
   CallInst *FallbackCall = IRB.CreateCall(
       UnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
   FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
@@ -2406,10 +2461,11 @@
   std::tie(ShadowAddr, OriginAddr) =
       DFS.getShadowOriginAddress(Addr, InstAlignment, Pos);
 
-  const unsigned ShadowVecSize = 128 / DFS.ShadowWidthBits;
+  const unsigned ShadowVecSize = 8;
   uint64_t Offset = 0;
   uint64_t LeftSize = Size;
   if (LeftSize >= ShadowVecSize) {
+    assert(ShadowVecSize * DFS.ShadowWidthBits <= 128);
     auto *ShadowVecTy =
         FixedVectorType::get(DFS.PrimitiveShadowTy, ShadowVecSize);
     Value *ShadowVec = UndefValue::get(ShadowVecTy);
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/abilist.ll b/llvm/test/Instrumentation/DataFlowSanitizer/abilist.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/abilist.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/abilist.ll
@@ -1,4 +1,6 @@
 ; RUN: opt < %s -dfsan -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/abilist_aggregate.ll b/llvm/test/Instrumentation/DataFlowSanitizer/abilist_aggregate.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/abilist_aggregate.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/abilist_aggregate.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI
 ; RUN: opt < %s -dfsan -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s --check-prefixes=CHECK,LEGACY
 ; RUN: opt < %s -dfsan -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s --check-prefixes=CHECK,ARGS_ABI
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/array.ll b/llvm/test/Instrumentation/DataFlowSanitizer/array.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/array.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/array.ll
@@ -1,10 +1,15 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,LEGACY
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-event-callbacks=true -S | FileCheck %s --check-prefixes=CHECK,EVENT_CALLBACKS
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-event-callbacks=true -S | FileCheck %s --check-prefixes=CHECK,EVENT_CALLBACKS
 ; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s --check-prefixes=CHECK,ARGS_ABI
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST16
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_LOAD_PTR
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-debug-nonzero-labels -S | FileCheck %s --check-prefixes=CHECK,DEBUG_NONZERO_LABELS
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST16
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_LOAD_PTR
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-debug-nonzero-labels -S | FileCheck %s --check-prefixes=CHECK,DEBUG_NONZERO_LABELS
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll b/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK16
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK
 ;
 ; The patterns about origins cannot be tested until the origin tracking feature is complete.
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll b/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,CHECK_NO_ORIGIN -DSHADOW_MASK=-123145302310913
 ; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK_ORIGIN -DSHADOW_MASK=-123145302310913
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -S | FileCheck %s --check-prefixes=CHECK,CHECK_NO_ORIGIN -DSHADOW_MASK=-105553116266497
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/call.ll b/llvm/test/Instrumentation/DataFlowSanitizer/call.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/call.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/call.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels -S | FileCheck %s
 ; RUN: opt < %s -passes=dfsan -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/external_mask.ll b/llvm/test/Instrumentation/DataFlowSanitizer/external_mask.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/external_mask.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/external_mask.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,CHECK16
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK16
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/fast16labels.ll b/llvm/test/Instrumentation/DataFlowSanitizer/fast16labels.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/fast16labels.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/fast16labels.ll
@@ -1,6 +1,7 @@
 ; Test that -dfsan-fast-16-labels mode uses inline ORs rather than calling
 ; __dfsan_union or __dfsan_union_load.
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -S | FileCheck %s --implicit-check-not="call{{.*}}__dfsan_union" --check-prefixes=CHECK,CHECK16
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels -S | FileCheck %s --implicit-check-not="call{{.*}}__dfsan_union" --check-prefixes=CHECK,CHECK8
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -13,7 +14,7 @@
   ; CHECK-LABEL: define i8 @"dfs$add"
   ; CHECK-DAG: %[[ALABEL:.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
   ; CHECK-DAG: %[[BLABEL:.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN]]
-  ; CHECK: %[[ADDLABEL:.*]] = or i16 %[[ALABEL]], %[[BLABEL]]
+  ; CHECK: %[[ADDLABEL:.*]] = or i[[#SBITS]] %[[ALABEL]], %[[BLABEL]]
   ; CHECK: %c = add i8 %a, %b
   ; CHECK: store i[[#SBITS]] %[[ADDLABEL]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
   ; CHECK: ret i8 %c
@@ -24,7 +25,7 @@
 define i8 @load8(i8* %p) {
   ; CHECK-LABEL:  define i8 @"dfs$load8"
   ; CHECK-SAME:   (i8* %[[PADDR:.*]])
-  ; CHECK-NEXT:   %[[#ARG:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i16*), align [[ALIGN]]
+  ; CHECK-NEXT:   %[[#ARG:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
   ; CHECK-NEXT:   %[[#R:]] = ptrtoint i8* %[[PADDR]] to i64
   ; CHECK-NEXT:   %[[#PS:R+1]] = and i64 %[[#R]], [[#%.10d,MASK:]]
   ; CHECK16-NEXT: %[[#PS:R+2]] = mul i64 %[[#R+1]], 2
@@ -106,6 +107,16 @@
   ; CHECK16-NEXT: %[[#WS+5]]        = trunc i64 %[[#WS+4]] to i[[#SBITS]]
   ; CHECK16-NEXT: %[[#S_OUT:]]      = or i[[#SBITS]] %[[#WS+5]], %[[#ARG]]
 
+  ; COMM: On fast8, no need to OR the wide shadow but one more shift is needed.
+  ; CHECK8-NEXT: %[[#WS+1]]         = lshr i64 %[[#WS]], 32
+  ; CHECK8-NEXT: %[[#WS+2]]         = or i64 %[[#WS]], %[[#WS+1]]
+  ; CHECK8-NEXT: %[[#WS+3]]         = lshr i64 %[[#WS+2]], 16
+  ; CHECK8-NEXT: %[[#WS+4]]         = or i64 %[[#WS+2]], %[[#WS+3]]
+  ; CHECK8-NEXT: %[[#WS+5]]         = lshr i64 %[[#WS+4]], 8
+  ; CHECK8-NEXT: %[[#WS+6]]         = or i64 %[[#WS+4]], %[[#WS+5]]
+  ; CHECK8-NEXT: %[[#WS+7]]         = trunc i64 %[[#WS+6]] to i[[#SBITS]]
+  ; CHECK8-NEXT: %[[#S_OUT:]]       = or i[[#SBITS]] %[[#WS+7]], %[[#ARG]]
+
   ; CHECK-NEXT:   %a = load i64, i64* %p
   ; CHECK-NEXT:   store i[[#SBITS]] %[[#S_OUT]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
   ; CHECK-NEXT:   ret i64 %a
@@ -142,6 +153,16 @@
   ; CHECK16-NEXT: %[[#WS+5]]    = trunc i64 %[[#WS+4]] to i[[#SBITS]]
   ; CHECK16-NEXT: %[[#S_OUT:]]  = or i[[#SBITS]] %[[#WS+5]], %[[#ARG]]
 
+  ; COMM: On fast8, we need to OR 2x64bits for the wide shadow, before ORing its bytes (one more shift).
+  ; CHECK8-NEXT: %[[#WS+1]]     = lshr i64 %[[#WS]], 32
+  ; CHECK8-NEXT: %[[#WS+2]]     = or i64 %[[#WS]], %[[#WS+1]]
+  ; CHECK8-NEXT: %[[#WS+3]]     = lshr i64 %[[#WS+2]], 16
+  ; CHECK8-NEXT: %[[#WS+4]]     = or i64 %[[#WS+2]], %[[#WS+3]]
+  ; CHECK8-NEXT: %[[#WS+5]]     = lshr i64 %[[#WS+4]], 8
+  ; CHECK8-NEXT: %[[#WS+6]]     = or i64 %[[#WS+4]], %[[#WS+5]]
+  ; CHECK8-NEXT: %[[#WS+7]]     = trunc i64 %[[#WS+6]] to i[[#SBITS]]
+  ; CHECK8-NEXT: %[[#S_OUT:]]   = or i[[#SBITS]] %[[#WS+7]], %[[#ARG]]
+
   ; CHECK-NEXT: %a = load i128, i128* %p
   ; CHECK-NEXT: store i[[#SBITS]] %[[#S_OUT]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
   ; CHECK-NEXT: ret i128 %a
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/phi.ll b/llvm/test/Instrumentation/DataFlowSanitizer/phi.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/phi.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/phi.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,LEGACY
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -11,7 +12,7 @@
   ; LEGACY: [[PL:%.*]] = phi i[[#SBITS]] [ [[AL]], %T ], [ [[AL]], %F ]
   ; LEGACY: store i[[#SBITS]] [[PL]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
 
-  ; FAST: [[AL:%.*]] = load { [[ST:i[0-9]+]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([100 x i64]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
+  ; FAST: [[AL:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([100 x i64]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]]
   ; FAST: [[AL0:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[AL]], i[[#SBITS]] 0, 0
   ; FAST: [[AL1:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[AL]], i[[#SBITS]] 0, 1
   ; FAST: [[PL:%.*]] = phi { i[[#SBITS]], i[[#SBITS]] } [ [[AL0]], %T ], [ [[AL1]], %F ]
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/select.ll b/llvm/test/Instrumentation/DataFlowSanitizer/select.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/select.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/select.ll
@@ -2,6 +2,8 @@
 ; RUN: opt < %s -dfsan -dfsan-track-select-control-flow=0 -S | FileCheck %s --check-prefixes=CHECK,NO_TRACK_CF,NO_TRACK_CF_LEGACY
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -dfsan-track-select-control-flow=1 -S | FileCheck %s --check-prefixes=CHECK,TRACK_CF,TRACK_CF_FAST
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -dfsan-track-select-control-flow=0 -S | FileCheck %s --check-prefixes=CHECK,NO_TRACK_CF,NO_TRACK_CF_FAST
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels -dfsan-track-select-control-flow=1 -S | FileCheck %s --check-prefixes=CHECK,TRACK_CF,TRACK_CF_FAST
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels -dfsan-track-select-control-flow=0 -S | FileCheck %s --check-prefixes=CHECK,NO_TRACK_CF,NO_TRACK_CF_FAST
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll b/llvm/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -dfsan -S --dfsan-abilist=%S/Inputs/shadow-args-abilist.txt | FileCheck %s
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -dfsan -S --dfsan-abilist=%S/Inputs/shadow-args-abilist.txt -dfsan-fast-16-labels | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -dfsan -S --dfsan-abilist=%S/Inputs/shadow-args-abilist.txt -dfsan-fast-8-labels | FileCheck %s
 
 ; REQUIRES: x86-registered-target
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/store.ll b/llvm/test/Instrumentation/DataFlowSanitizer/store.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/store.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/store.ll
@@ -2,6 +2,8 @@
 ; RUN: opt < %s -dfsan -dfsan-combine-pointer-labels-on-store=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK16,NO_COMBINE_PTR_LABEL
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -dfsan-combine-pointer-labels-on-store=1 -S | FileCheck %s --check-prefixes=CHECK,CHECK16,COMBINE_PTR_LABEL_FAST
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -dfsan-combine-pointer-labels-on-store=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK16,NO_COMBINE_PTR_LABEL
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels -dfsan-combine-pointer-labels-on-store=1 -S | FileCheck %s --check-prefixes=CHECK,COMBINE_PTR_LABEL_FAST
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels -dfsan-combine-pointer-labels-on-store=0 -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_PTR_LABEL
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/struct.ll b/llvm/test/Instrumentation/DataFlowSanitizer/struct.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/struct.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/struct.ll
@@ -6,6 +6,11 @@
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-track-select-control-flow=false -S | FileCheck %s --check-prefixes=CHECK,NO_SELECT_CONTROL
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-debug-nonzero-labels -S | FileCheck %s --check-prefixes=CHECK,DEBUG_NONZERO_LABELS
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST16
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_LOAD_PTR
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-track-select-control-flow=false -S | FileCheck %s --check-prefixes=CHECK,NO_SELECT_CONTROL
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-debug-nonzero-labels -S | FileCheck %s --check-prefixes=CHECK,DEBUG_NONZERO_LABELS
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/vector.ll b/llvm/test/Instrumentation/DataFlowSanitizer/vector.ll
--- a/llvm/test/Instrumentation/DataFlowSanitizer/vector.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/vector.ll
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI,TLS_ABI_LEGACY
 ; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s --check-prefixes=CHECK,ARGS_ABI
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI,TLS_ABI_FAST
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI,TLS_ABI_FAST
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"