Index: lib/Analysis/PrintfFormatString.cpp =================================================================== --- lib/Analysis/PrintfFormatString.cpp +++ lib/Analysis/PrintfFormatString.cpp @@ -15,6 +15,8 @@ #include "clang/Analysis/Analyses/FormatString.h" #include "FormatStringParsing.h" #include "clang/Basic/TargetInfo.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/Locale.h" using clang::analyze_format_string::ArgType; using clang::analyze_format_string::FormatStringHandler; @@ -312,8 +314,19 @@ argIndex++; if (k == ConversionSpecifier::InvalidSpecifier) { + const UTF8 *SB = reinterpret_cast(Start + 1); + const UTF8 *SE = reinterpret_cast(E); + const char FirstByte = *SB; + unsigned Len = I - Start; + + // If the specifier in non-printable, it could be the first byte of a + // UTF-8 sequence. If that's the case, adjust the length accordingly. + if (Start + 1 < I && !llvm::sys::locale::isPrint(FirstByte) && + isLegalUTF8String(&SB, SE)) + Len = getNumBytesForUTF8(FirstByte) + 1; + // Assume the conversion takes one argument. - return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, I - Start); + return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len); } return PrintfSpecifierResult(Start, FS); } Index: lib/Sema/SemaChecking.cpp =================================================================== --- lib/Sema/SemaChecking.cpp +++ lib/Sema/SemaChecking.cpp @@ -36,6 +36,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallString.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/Locale.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/raw_ostream.h" #include @@ -3976,12 +3978,41 @@ // gibberish when trying to match arguments. keepGoing = false; } - - EmitFormatDiagnostic(S.PDiag(diag::warn_format_invalid_conversion) - << StringRef(csStart, csLen), - Loc, /*IsStringLocation*/true, - getSpecifierRange(startSpec, specifierLen)); - + + StringRef Specifier(csStart, csLen); + + // If the specifier in non-printable, it could be the first byte of a UTF-8 + // sequence. In that case, print the UTF-8 code point. If not, print the byte + // hex value. + std::string CodePointStr; + if (!llvm::sys::locale::isPrint(*csStart)) { + UTF32 CodePoint; + const UTF8 **B = reinterpret_cast(&csStart); + const UTF8 *E = + reinterpret_cast(csStart + specifierLen - csLen); + ConversionResult Result = + llvm::convertUTF8Sequence(B, E, &CodePoint, strictConversion); + + if (Result != conversionOK) { + unsigned char FirstChar = *csStart; + CodePoint = (UTF32)FirstChar; + } + + llvm::raw_string_ostream OS(CodePointStr); + if (CodePoint < 256) + OS << "\\x" << llvm::format("%02x", CodePoint); + else if (CodePoint <= 0xFFFF) + OS << "\\u" << llvm::format("%04x", CodePoint); + else + OS << "\\U" << llvm::format("%08x", CodePoint); + OS.flush(); + Specifier = CodePointStr; + } + + EmitFormatDiagnostic( + S.PDiag(diag::warn_format_invalid_conversion) << Specifier, Loc, + /*IsStringLocation*/ true, getSpecifierRange(startSpec, specifierLen)); + return keepGoing; } Index: test/SemaObjC/format-strings-objc.m =================================================================== --- test/SemaObjC/format-strings-objc.m +++ test/SemaObjC/format-strings-objc.m @@ -265,3 +265,11 @@ NSLog(@"%2$[tt]@ %1$[tt]s", @"Foo", @"Bar"); // expected-warning {{object format flags cannot be used with 's' conversion specifier}} } +// Test Objective-C invalid no printable specifiers +void testObjcInvalidNoPrintable() { + NSLog(@"%\u25B9", 3); // expected-warning {{invalid conversion specifier '\u25b9'}} + NSLog(@"%\xE2\x96\xB9", 3); // expected-warning {{invalid conversion specifier '\u25b9'}} + NSLog(@"%\U00010348", 42); // expected-warning {{invalid conversion specifier '\U00010348'}} + NSLog(@"%\xF0\x90\x8D\x88", 42); // expected-warning {{invalid conversion specifier '\U00010348'}} + NSLog(@"%\xe2", @"Foo"); // expected-warning {{input conversion stopped}} expected-warning {{invalid conversion specifier '\xe2'}} +}