Index: include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h =================================================================== --- include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h +++ include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h @@ -43,6 +43,9 @@ ProgramStateManager &, SubEngine *); typedef std::unique_ptr(*StoreManagerCreator)( ProgramStateManager &); +typedef llvm::ImmutableMap TaintedSubRegions; +typedef llvm::ImmutableMapRef + TaintedSubRegionsRef; //===----------------------------------------------------------------------===// // ProgramStateTrait - Traits used by the Generic Data Map of a ProgramState. @@ -87,6 +90,7 @@ Store store; // Maps a location to its current value. GenericDataMap GDM; // Custom data stored by a client of this class. unsigned refCount; + TaintedSubRegions::Factory TSRFactory; /// makeWithStore - Return a ProgramState with the same values as the current /// state with the exception of using the specified Store. @@ -343,6 +347,9 @@ ProgramStateRef addTaint(const Stmt *S, const LocationContext *LCtx, TaintTagType Kind = TaintTagGeneric) const; + /// Create a new state in which the value is marked as tainted. + ProgramStateRef addTaint(SVal V, TaintTagType Kind = TaintTagGeneric) const; + /// Create a new state in which the symbol is marked as tainted. ProgramStateRef addTaint(SymbolRef S, TaintTagType Kind = TaintTagGeneric) const; @@ -351,6 +358,14 @@ ProgramStateRef addTaint(const MemRegion *R, TaintTagType Kind = TaintTagGeneric) const; + /// Create a new state in a which a sub-region of a given symbol is tainted. + /// This might be necessary when referring to regions that can not have an + /// individual symbol, e.g. if they are represented by the default binding of + /// a LazyCompoundVal. + ProgramStateRef addPartialTaint(SymbolRef ParentSym, + const SubRegion *SubRegion, + TaintTagType Kind = TaintTagGeneric) const; + /// Check if the statement is tainted in the current state. bool isTainted(const Stmt *S, const LocationContext *LCtx, TaintTagType Kind = TaintTagGeneric) const; Index: include/clang/StaticAnalyzer/Core/PathSensitive/TaintManager.h =================================================================== --- include/clang/StaticAnalyzer/Core/PathSensitive/TaintManager.h +++ include/clang/StaticAnalyzer/Core/PathSensitive/TaintManager.h @@ -35,6 +35,16 @@ static void *GDMIndex() { static int index = 0; return &index; } }; +/// The GDM component mapping derived symbols' parent symbols to their +/// underlying regions. This is used to efficiently check whether a symbol is +/// tainted when it represents a sub-region of a tainted symbol. +struct DerivedSymTaint {}; +typedef llvm::ImmutableMap DerivedSymTaintImpl; +template<> struct ProgramStateTrait + : public ProgramStatePartialTrait { + static void *GDMIndex() { static int index; return &index; } +}; + class TaintManager { TaintManager() {} Index: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp =================================================================== --- lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp +++ lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp @@ -65,21 +65,8 @@ /// and thus, is tainted. static bool isStdin(const Expr *E, CheckerContext &C); - /// This is called from getPointedToSymbol() to resolve symbol references for - /// the region underlying a LazyCompoundVal. This is the default binding - /// for the LCV, which could be a conjured symbol from a function call that - /// initialized the region. It only returns the conjured symbol if the LCV - /// covers the entire region, e.g. we avoid false positives by not returning - /// a default bindingc for an entire struct if the symbol for only a single - /// field or element within it is requested. - // TODO: Return an appropriate symbol for sub-fields/elements of an LCV so - // that they are also appropriately tainted. - static SymbolRef getLCVSymbol(CheckerContext &C, - nonloc::LazyCompoundVal &LCV); - - /// \brief Given a pointer argument, get the symbol of the value it contains - /// (points to). - static SymbolRef getPointedToSymbol(CheckerContext &C, const Expr *Arg); + /// \brief Given a pointer argument, return the value it points to. + static Optional getPointedToSVal(CheckerContext &C, const Expr *Arg); /// Functions defining the attack surface. typedef ProgramStateRef (GenericTaintChecker::*FnCheck)(const CallExpr *, @@ -186,9 +173,14 @@ static inline bool isTaintedOrPointsToTainted(const Expr *E, ProgramStateRef State, CheckerContext &C) { - return (State->isTainted(E, C.getLocationContext()) || isStdin(E, C) || - (E->getType().getTypePtr()->isPointerType() && - State->isTainted(getPointedToSymbol(C, E)))); + if (State->isTainted(E, C.getLocationContext()) || isStdin(E, C)) + return true; + + if (!E->getType().getTypePtr()->isPointerType()) + return false; + + Optional V = getPointedToSVal(C, E); + return (V && State->isTainted(*V)); } /// \brief Pre-process a function which propagates taint according to the @@ -400,9 +392,9 @@ if (CE->getNumArgs() < (ArgNum + 1)) return false; const Expr* Arg = CE->getArg(ArgNum); - SymbolRef Sym = getPointedToSymbol(C, Arg); - if (Sym) - State = State->addTaint(Sym); + Optional V = getPointedToSVal(C, Arg); + if (V) + State = State->addTaint(*V); } // Clear up the taint info from the state. @@ -473,47 +465,20 @@ return false; } -SymbolRef GenericTaintChecker::getLCVSymbol(CheckerContext &C, - nonloc::LazyCompoundVal &LCV) { - StoreManager &StoreMgr = C.getStoreManager(); - - // getLCVSymbol() is reached in a PostStmt so we can always expect a default - // binding to exist if one is present. - if (Optional binding = StoreMgr.getDefaultBinding(LCV)) { - SymbolRef Sym = binding->getAsSymbol(); - if (!Sym) - return nullptr; - - // If the LCV covers an entire base region return the default conjured symbol. - if (LCV.getRegion() == LCV.getRegion()->getBaseRegion()) - return Sym; - } - - // Otherwise, return a nullptr as there's not yet a functional way to taint - // sub-regions of LCVs. - return nullptr; -} - -SymbolRef GenericTaintChecker::getPointedToSymbol(CheckerContext &C, - const Expr* Arg) { +Optional GenericTaintChecker::getPointedToSVal(CheckerContext &C, + const Expr* Arg) { ProgramStateRef State = C.getState(); SVal AddrVal = State->getSVal(Arg->IgnoreParens(), C.getLocationContext()); if (AddrVal.isUnknownOrUndef()) - return nullptr; + return None; Optional AddrLoc = AddrVal.getAs(); if (!AddrLoc) - return nullptr; + return None; const PointerType *ArgTy = dyn_cast(Arg->getType().getCanonicalType().getTypePtr()); - SVal Val = State->getSVal(*AddrLoc, - ArgTy ? ArgTy->getPointeeType(): QualType()); - - if (auto LCV = Val.getAs()) - return getLCVSymbol(C, *LCV); - - return Val.getAsSymbol(); + return State->getSVal(*AddrLoc, ArgTy ? ArgTy->getPointeeType(): QualType()); } ProgramStateRef @@ -633,9 +598,9 @@ // The arguments are pointer arguments. The data they are pointing at is // tainted after the call. const Expr* Arg = CE->getArg(i); - SymbolRef Sym = getPointedToSymbol(C, Arg); - if (Sym) - State = State->addTaint(Sym); + Optional V = getPointedToSVal(C, Arg); + if (V) + State = State->addTaint(*V); } return State; } @@ -710,10 +675,10 @@ // Check for taint. ProgramStateRef State = C.getState(); - const SymbolRef PointedToSym = getPointedToSymbol(C, E); + Optional PointedToSVal = getPointedToSVal(C, E); SVal TaintedSVal; - if (State->isTainted(PointedToSym)) - TaintedSVal = nonloc::SymbolVal(PointedToSym); + if (PointedToSVal && State->isTainted(*PointedToSVal)) + TaintedSVal = *PointedToSVal; else if (State->isTainted(E, C.getLocationContext())) TaintedSVal = C.getSVal(E); else Index: lib/StaticAnalyzer/Core/ProgramState.cpp =================================================================== --- lib/StaticAnalyzer/Core/ProgramState.cpp +++ lib/StaticAnalyzer/Core/ProgramState.cpp @@ -644,15 +644,33 @@ if (const Expr *E = dyn_cast_or_null(S)) S = E->IgnoreParens(); - SymbolRef Sym = getSVal(S, LCtx).getAsSymbol(); + return addTaint(getSVal(S, LCtx), Kind); +} + +ProgramStateRef ProgramState::addTaint(SVal V, + TaintTagType Kind) const { + SymbolRef Sym = V.getAsSymbol(); if (Sym) return addTaint(Sym, Kind); - const MemRegion *R = getSVal(S, LCtx).getAsRegion(); - addTaint(R, Kind); + // If the SVal represents a structure, try to mass-taint all values within the + // structure. For now it only works efficiently on lazy compound values that + // were conjured during a conservative evaluation of a function - either as + // return values of functions that return structures or arrays by value, or as + // values of structures or arrays passed into the function by reference, + // directly or through pointer aliasing. Such lazy compound values are + // characterized by having exactly one binding in their captured store within + // their parent region, which is a conjured symbol default-bound to the base + // region of the parent region. + if (auto LCV = V.getAs()) { + if (Optional binding = getStateManager().StoreMgr->getDefaultBinding(*LCV)) { + if (SymbolRef Sym = binding->getAsSymbol()) + return addPartialTaint(Sym, LCV->getRegion(), Kind); + } + } - // Cannot add taint, so just return the state. - return this; + const MemRegion *R = V.getAsRegion(); + return addTaint(R, Kind); } ProgramStateRef ProgramState::addTaint(const MemRegion *R, @@ -674,6 +692,28 @@ return NewState; } +ProgramStateRef ProgramState::addPartialTaint(SymbolRef ParentSym, + const SubRegion *SubRegion, + TaintTagType Kind) const { + // Ignore partial taint if the entire parent symbol is already tainted. + if (contains(ParentSym) && *get(ParentSym) == Kind) + return this; + + // Partial taint applies if only a portion of the symbol is tainted. + if (SubRegion == SubRegion->getBaseRegion()) + return addTaint(ParentSym, Kind); + + TaintedSubRegionsRef TaintedSubRegions(0, TSRFactory.getTreeFactory()); + if (const TaintedSubRegionsRef *SavedTaintedRegions = + get(ParentSym)) + TaintedSubRegions = *SavedTaintedRegions; + + TaintedSubRegions = TaintedSubRegions.add(SubRegion, Kind); + ProgramStateRef NewState = set(ParentSym, TaintedSubRegions); + assert(NewState); + return NewState; +} + bool ProgramState::isTainted(const Stmt *S, const LocationContext *LCtx, TaintTagType Kind) const { if (const Expr *E = dyn_cast_or_null(S)) @@ -714,31 +754,54 @@ return false; // Traverse all the symbols this symbol depends on to see if any are tainted. - bool Tainted = false; for (SymExpr::symbol_iterator SI = Sym->symbol_begin(), SE =Sym->symbol_end(); SI != SE; ++SI) { if (!isa(*SI)) continue; - const TaintTagType *Tag = get(*SI); - Tainted = (Tag && *Tag == Kind); + if (const TaintTagType *Tag = get(*SI)) { + if (*Tag == Kind) + return true; + } - // If this is a SymbolDerived with a tainted parent, it's also tainted. - if (const SymbolDerived *SD = dyn_cast(*SI)) - Tainted = Tainted || isTainted(SD->getParentSymbol(), Kind); + if (const SymbolDerived *SD = dyn_cast(*SI)) { + // If this is a SymbolDerived with a tainted parent, it's also tainted. + if (isTainted(SD->getParentSymbol(), Kind)) + return true; + + // If this is a SymbolDerived with the same parent symbol as another + // tainted SymbolDerived and a region that's a sub-region of that tainted + // symbol, it's also tainted. + if (const TaintedSubRegionsRef *SymRegions = + get(SD->getParentSymbol())) { + const TypedValueRegion *R = SD->getRegion(); + for (TaintedSubRegionsRef::iterator I = SymRegions->begin(), + E = SymRegions->end(); + I != E; ++I) { + // FIXME: The logic to identify tainted regions could be more + // complete. For example, this would not currently identify + // overlapping fields in a union as tainted. To identify this we can + // check for overlapping/nested byte offsets. + if (Kind == I->second && + (R == I->first || R->isSubRegionOf(I->first))) + return true; + } + } + } // If memory region is tainted, data is also tainted. - if (const SymbolRegionValue *SRV = dyn_cast(*SI)) - Tainted = Tainted || isTainted(SRV->getRegion(), Kind); - - // If If this is a SymbolCast from a tainted value, it's also tainted. - if (const SymbolCast *SC = dyn_cast(*SI)) - Tainted = Tainted || isTainted(SC->getOperand(), Kind); + if (const SymbolRegionValue *SRV = dyn_cast(*SI)) { + if (isTainted(SRV->getRegion(), Kind)) + return true; + } - if (Tainted) - return true; + // If this is a SymbolCast from a tainted value, it's also tainted. + if (const SymbolCast *SC = dyn_cast(*SI)) { + if (isTainted(SC->getOperand(), Kind)) + return true; + } } - return Tainted; + return false; } Index: lib/StaticAnalyzer/Core/RegionStore.cpp =================================================================== --- lib/StaticAnalyzer/Core/RegionStore.cpp +++ lib/StaticAnalyzer/Core/RegionStore.cpp @@ -496,7 +496,10 @@ Optional getDefaultBinding(Store S, const MemRegion *R) override { RegionBindingsRef B = getRegionBindings(S); - return B.getDefaultBinding(R); + // Default bindings are always applied over a base region so look up the + // base region's default binding, otherwise the lookup will fail when R + // is at an offset from R->getBaseRegion(). + return B.getDefaultBinding(R->getBaseRegion()); } SVal getBinding(RegionBindingsConstRef B, Loc L, QualType T = QualType()); Index: test/Analysis/taint-generic.c =================================================================== --- test/Analysis/taint-generic.c +++ test/Analysis/taint-generic.c @@ -192,20 +192,41 @@ void testStructArray() { struct { - char buf[16]; - struct { - int length; - } st[1]; - } tainted; + int length; + } tainted[4]; - char buffer[16]; + char dstbuf[16], srcbuf[16]; int sock; sock = socket(AF_INET, SOCK_STREAM, 0); - read(sock, &tainted.buf[0], sizeof(tainted.buf)); - read(sock, &tainted.st[0], sizeof(tainted.st)); - // FIXME: tainted.st[0].length should be marked tainted - __builtin_memcpy(buffer, tainted.buf, tainted.st[0].length); // no-warning + __builtin_memset(srcbuf, 0, sizeof(srcbuf)); + + read(sock, &tainted[0], sizeof(tainted)); + __builtin_memcpy(dstbuf, srcbuf, tainted[0].length); // expected-warning {{Untrusted data is used to specify the buffer size}} + + __builtin_memset(&tainted, 0, sizeof(tainted)); + read(sock, &tainted, sizeof(tainted)); + __builtin_memcpy(dstbuf, srcbuf, tainted[0].length); // expected-warning {{Untrusted data is used to specify the buffer size}} + + __builtin_memset(&tainted, 0, sizeof(tainted)); + // If we taint element 1, we should not raise an alert on taint for element 0 or element 2 + read(sock, &tainted[1], sizeof(tainted)); + __builtin_memcpy(dstbuf, srcbuf, tainted[0].length); // no-warning + __builtin_memcpy(dstbuf, srcbuf, tainted[2].length); // no-warning +} + +void testUnion() { + union { + int x; + char y[4]; + } tainted; + + char buffer[4]; + + int sock = socket(AF_INET, SOCK_STREAM, 0); + read(sock, &tainted.y, sizeof(tainted.y)); + // FIXME: overlapping regions aren't detected by isTainted yet + __builtin_memcpy(buffer, tainted.y, tainted.x); } int testDivByZero() {