summaryrefslogtreecommitdiffstats
path: root/clang/lib
diff options
context:
space:
mode:
Diffstat (limited to 'clang/lib')
-rw-r--r-- clang/lib/AST/Expr.cpp 3
-rw-r--r-- clang/lib/Lex/LiteralSupport.cpp 80
-rw-r--r-- clang/lib/Sema/SemaChecking.cpp 2
3 files changed, 74 insertions, 11 deletions
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 9d7a93a429e..d3f6a521f1c 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -679,7 +679,8 @@ void StringLiteral::setString(ASTContext &C, StringRef Str,
SourceLocation StringLiteral::
getLocationOfByte(unsigned ByteNo, const SourceManager &SM,
const LangOptions &Features, const TargetInfo &Target) const {
- assert(Kind == StringLiteral::Ascii && "This only works for ASCII strings");
+ assert((Kind == StringLiteral::Ascii || Kind == StringLiteral::UTF8) &&
+ "Only narrow string literals are currently supported");
// Loop over all of the tokens in this string until we find the one that
// contains the byte we're looking for.
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index c7120f2befb..2930d6a5ff0 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -250,6 +250,39 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
return true;
}
+/// MeasureUCNEscape - Return the number of bytes this UCN occupies in the
+/// encoded string; if ProcessUCNEscape fails, set HadError and return 0.
+static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
+ const char *ThisTokEnd, unsigned CharByteWidth,
+ const LangOptions &Features, bool &HadError) {
+ // UTF-32: always 4 bytes. NOTE: this returns before ThisTokBuf is touched.
+ if (CharByteWidth == 4)
+ return 4;
+
+ uint32_t UcnVal = 0;
+ unsigned short UcnLen = 0;
+ FullSourceLoc Loc;
+
+ if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
+ UcnLen, Loc, 0, Features, true)) {
+ HadError = true;
+ return 0;
+ }
+
+ // UTF-16: 2 bytes for a BMP code point (<= 0xFFFF), 4 bytes otherwise.
+ if (CharByteWidth == 2)
+ return UcnVal <= 0xFFFF ? 2 : 4;
+
+ // UTF-8: 1 to 4 bytes, selected by the code point thresholds below.
+ if (UcnVal < 0x80)
+ return 1;
+ if (UcnVal < 0x800)
+ return 2;
+ if (UcnVal < 0x10000)
+ return 3;
+ return 4;
+}
+
/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
/// StringLiteralParser. When we decide to implement UCN's for identifiers,
@@ -265,7 +298,7 @@ static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
unsigned short UcnLen = 0;
if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
Loc, Diags, Features, true)) {
- HadError = 1;
+ HadError = true;
return;
}
@@ -1369,14 +1402,31 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
if (StringInvalid)
return 0;
+ const char *SpellingStart = SpellingPtr;
+ const char *SpellingEnd = SpellingPtr+TokLen;
+
+ // Handle UTF-8 strings just like narrow strings.
+ if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
+ SpellingPtr += 2;
+
assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
+ // For raw string literals, this is easy.
+ if (SpellingPtr[0] == 'R') {
+ assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
+ // Skip 'R"'.
+ SpellingPtr += 2;
+ while (*SpellingPtr != '(') {
+ ++SpellingPtr;
+ assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
+ }
+ // Skip '('.
+ ++SpellingPtr;
+ return SpellingPtr - SpellingStart + ByteNo;
+ }
- const char *SpellingStart = SpellingPtr;
- const char *SpellingEnd = SpellingPtr+TokLen;
-
- // Skip over the leading quote.
+ // Skip over the leading quote
assert(SpellingPtr[0] == '"' && "Should be a string literal!");
++SpellingPtr;
@@ -1393,11 +1443,23 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
// Otherwise, this is an escape character. Advance over it.
bool HadError = false;
- ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
- FullSourceLoc(Tok.getLocation(), SM),
- CharByteWidth*8, Diags);
+ if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
+ const char *EscapePtr = SpellingPtr;
+ unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
+ 1, Features, HadError);
+ if (Len > ByteNo) {
+ // ByteNo is somewhere within the escape sequence.
+ SpellingPtr = EscapePtr;
+ break;
+ }
+ ByteNo -= Len;
+ } else {
+ ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
+ FullSourceLoc(Tok.getLocation(), SM),
+ CharByteWidth*8, Diags);
+ --ByteNo;
+ }
assert(!HadError && "This method isn't valid on erroneous strings");
- --ByteNo;
}
return SpellingPtr-SpellingStart;
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index e818f5f3e07..06035921d98 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2633,7 +2633,7 @@ void Sema::CheckFormatString(const StringLiteral *FExpr,
bool inFunctionCall) {
// CHECK: is the format string a wide literal?
- if (!FExpr->isAscii()) {
+ if (!FExpr->isAscii() && !FExpr->isUTF8()) {
CheckFormatHandler::EmitFormatDiagnostic(
*this, inFunctionCall, Args[format_idx],
PDiag(diag::warn_format_string_is_wide_literal), FExpr->getLocStart(),
OpenPOWER on IntegriCloud