Index: include/clang/Analysis/Analyses/FormatString.h =================================================================== --- include/clang/Analysis/Analyses/FormatString.h +++ include/clang/Analysis/Analyses/FormatString.h @@ -210,6 +210,7 @@ unsigned getLength() const { return EndScanList ? EndScanList - Position : 1; } + void setEndScanList(const char *pos) { EndScanList = pos; } bool isIntArg() const { return (kind >= IntArgBeg && kind <= IntArgEnd) || kind == FreeBSDrArg || kind == FreeBSDyArg; } @@ -413,11 +414,6 @@ bool isObjCArg() const { return kind >= ObjCBeg && kind <= ObjCEnd; } bool isDoubleArg() const { return kind >= DoubleArgBeg && kind <= DoubleArgEnd; } - unsigned getLength() const { - // Conversion specifiers currently only are represented by - // single characters, but we be flexible. - return 1; - } static bool classof(const analyze_format_string::ConversionSpecifier *CS) { return CS->isPrintfKind(); @@ -546,8 +542,6 @@ ScanfConversionSpecifier(const char *pos, Kind k) : ConversionSpecifier(false, pos, k) {} - void setEndScanList(const char *pos) { EndScanList = pos; } - static bool classof(const analyze_format_string::ConversionSpecifier *CS) { return !CS->isPrintfKind(); } Index: lib/Analysis/FormatString.cpp =================================================================== --- lib/Analysis/FormatString.cpp +++ lib/Analysis/FormatString.cpp @@ -15,6 +15,8 @@ #include "FormatStringParsing.h" #include "clang/Basic/LangOptions.h" #include "clang/Basic/TargetInfo.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/Locale.h" using clang::analyze_format_string::ArgType; using clang::analyze_format_string::FormatStringHandler; @@ -260,6 +262,26 @@ return true; } +bool clang::analyze_format_string::ParseUTF8InvalidSpecifier( + const char *SpecifierBegin, const char *FmtStrEnd, unsigned &Len) { + if (SpecifierBegin + 1 >= FmtStrEnd) + return false; + + const UTF8 *SB = reinterpret_cast(SpecifierBegin + 1); + const UTF8 *SE = reinterpret_cast(FmtStrEnd); + const char FirstByte = *SB; + + // If the specifier is non-printable, it could be the first byte of a + // UTF-8 sequence. If that's the case, adjust the length accordingly. + if (llvm::sys::locale::isPrint(FirstByte)) + return false; + if (!isLegalUTF8String(&SB, SE)) + return false; + + Len = getNumBytesForUTF8(FirstByte) + 1; + return true; +} + //===----------------------------------------------------------------------===// // Methods on ArgType. //===----------------------------------------------------------------------===// Index: lib/Analysis/FormatStringParsing.h =================================================================== --- lib/Analysis/FormatStringParsing.h +++ lib/Analysis/FormatStringParsing.h @@ -46,7 +46,13 @@ /// FormatSpecifier& argument, and false otherwise. bool ParseLengthModifier(FormatSpecifier &FS, const char *&Beg, const char *E, const LangOptions &LO, bool IsScanf = false); - + +/// Returns true if the invalid specifier in \p SpecifierBegin is a UTF-8 +/// string; check that it won't go further than \p FmtStrEnd and write +/// up the total size in \p Len. +bool ParseUTF8InvalidSpecifier(const char *SpecifierBegin, + const char *FmtStrEnd, unsigned &Len); + template class SpecifierResult { T FS; const char *Start; Index: lib/Analysis/PrintfFormatString.cpp =================================================================== --- lib/Analysis/PrintfFormatString.cpp +++ lib/Analysis/PrintfFormatString.cpp @@ -312,8 +312,13 @@ argIndex++; if (k == ConversionSpecifier::InvalidSpecifier) { + unsigned Len = I - Start; + if (ParseUTF8InvalidSpecifier(Start, E, Len)) { + CS.setEndScanList(Start + Len); + FS.setConversionSpecifier(CS); + } // Assume the conversion takes one argument. - return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, I - Start); + return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len); } return PrintfSpecifierResult(Start, FS); } Index: lib/Analysis/ScanfFormatString.cpp =================================================================== --- lib/Analysis/ScanfFormatString.cpp +++ lib/Analysis/ScanfFormatString.cpp @@ -79,7 +79,7 @@ unsigned &argIndex, const LangOptions &LO, const TargetInfo &Target) { - + using namespace clang::analyze_format_string; using namespace clang::analyze_scanf; const char *I = Beg; const char *Start = nullptr; @@ -210,10 +210,15 @@ // FIXME: '%' and '*' doesn't make sense. Issue a warning. // FIXME: 'ConsumedSoFar' and '*' doesn't make sense. - + if (k == ScanfConversionSpecifier::InvalidSpecifier) { + unsigned Len = I - Beg; + if (ParseUTF8InvalidSpecifier(Beg, E, Len)) { + CS.setEndScanList(Beg + Len); + FS.setConversionSpecifier(CS); + } // Assume the conversion takes one argument. - return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, I - Beg); + return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len); } return ScanfSpecifierResult(Start, FS); } Index: lib/Sema/SemaChecking.cpp =================================================================== --- lib/Sema/SemaChecking.cpp +++ lib/Sema/SemaChecking.cpp @@ -36,6 +36,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallString.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/Locale.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/raw_ostream.h" #include @@ -3976,12 +3978,41 @@ // gibberish when trying to match arguments. keepGoing = false; } - - EmitFormatDiagnostic(S.PDiag(diag::warn_format_invalid_conversion) - << StringRef(csStart, csLen), - Loc, /*IsStringLocation*/true, - getSpecifierRange(startSpec, specifierLen)); - + + StringRef Specifier(csStart, csLen); + + // If the specifier in non-printable, it could be the first byte of a UTF-8 + // sequence. In that case, print the UTF-8 code point. If not, print the byte + // hex value. + std::string CodePointStr; + if (!llvm::sys::locale::isPrint(*csStart)) { + UTF32 CodePoint; + const UTF8 **B = reinterpret_cast(&csStart); + const UTF8 *E = + reinterpret_cast(csStart + csLen); + ConversionResult Result = + llvm::convertUTF8Sequence(B, E, &CodePoint, strictConversion); + + if (Result != conversionOK) { + unsigned char FirstChar = *csStart; + CodePoint = (UTF32)FirstChar; + } + + llvm::raw_string_ostream OS(CodePointStr); + if (CodePoint < 256) + OS << "\\x" << llvm::format("%02x", CodePoint); + else if (CodePoint <= 0xFFFF) + OS << "\\u" << llvm::format("%04x", CodePoint); + else + OS << "\\U" << llvm::format("%08x", CodePoint); + OS.flush(); + Specifier = CodePointStr; + } + + EmitFormatDiagnostic( + S.PDiag(diag::warn_format_invalid_conversion) << Specifier, Loc, + /*IsStringLocation*/ true, getSpecifierRange(startSpec, specifierLen)); + return keepGoing; } Index: test/SemaObjC/format-strings-objc.m =================================================================== --- test/SemaObjC/format-strings-objc.m +++ test/SemaObjC/format-strings-objc.m @@ -265,3 +265,16 @@ NSLog(@"%2$[tt]@ %1$[tt]s", @"Foo", @"Bar"); // expected-warning {{object format flags cannot be used with 's' conversion specifier}} } +// Test Objective-C invalid no printable specifiers +void testObjcInvalidNoPrintable(int *a) { + NSLog(@"%\u25B9", 3); // expected-warning {{invalid conversion specifier '\u25b9'}} + NSLog(@"%\xE2\x96\xB9", 3); // expected-warning {{invalid conversion specifier '\u25b9'}} + NSLog(@"%\U00010348", 42); // expected-warning {{invalid conversion specifier '\U00010348'}} + NSLog(@"%\xF0\x90\x8D\x88", 42); // expected-warning {{invalid conversion specifier '\U00010348'}} + NSLog(@"%\xe2", @"Foo"); // expected-warning {{input conversion stopped}} expected-warning {{invalid conversion specifier '\xe2'}} + scanf("%\u25B9", a); // expected-warning {{implicitly declaring library}} expected-note {{include the header}} expected-warning {{invalid conversion specifier '\u25b9'}} + scanf("%\xE2\x96\xB9", a); // expected-warning {{invalid conversion specifier '\u25b9'}} + scanf("%\U00010348", a); // expected-warning {{invalid conversion specifier '\U00010348'}} + scanf("%\xF0\x90\x8D\x88", a); // expected-warning {{invalid conversion specifier '\U00010348'}} + scanf("%\xe2", a); // expected-warning {{invalid conversion specifier '\xe2'}} +}