From 9a67f47882901e4cd3ab16935a7ca9a59a2e3c97 Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Wed, 28 Nov 2012 07:29:00 +0000 Subject: Teach Lexer::getSpelling about raw string literals. Specifically, if a raw string literal needs cleaning (because it contains line-splicing in the encoding prefix or in the ud-suffix), do not clean the section between the double-quotes -- that's the "raw" bit! llvm-svn: 168776 --- clang/lib/Lex/Lexer.cpp | 109 +++++++++++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 42 deletions(-) (limited to 'clang/lib/Lex/Lexer.cpp') diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index 4698e288c03..6cd18469e4c 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -233,16 +233,67 @@ void Lexer::Stringify(SmallVectorImpl<char> &Str) { // Token Spelling //===----------------------------------------------------------------------===// +/// \brief Slow case of getSpelling. Extract the characters comprising the +/// spelling of this token from the provided input buffer. +static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, + const LangOptions &LangOpts, char *Spelling) { + assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); + + size_t Length = 0; + const char *BufEnd = BufPtr + Tok.getLength(); + + if (Tok.is(tok::string_literal)) { + // Munch the encoding-prefix and opening double-quote. + while (BufPtr < BufEnd) { + unsigned Size; + Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); + BufPtr += Size; + + if (Spelling[Length - 1] == '"') + break; + } + + // Raw string literals need special handling; trigraph expansion and line + // splicing do not occur within their d-char-sequence nor within their + // r-char-sequence. + if (Length >= 2 && + Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { + // Search backwards from the end of the token to find the matching closing + // quote. 
+ const char *RawEnd = BufEnd; + do --RawEnd; while (*RawEnd != '"'); + size_t RawLength = RawEnd - BufPtr + 1; + + // Everything between the quotes is included verbatim in the spelling. + memcpy(Spelling + Length, BufPtr, RawLength); + Length += RawLength; + BufPtr += RawLength; + + // The rest of the token is lexed normally. + } + } + + while (BufPtr < BufEnd) { + unsigned Size; + Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); + BufPtr += Size; + } + + assert(Length < Tok.getLength() && + "NeedsCleaning flag set on token that didn't need cleaning!"); + return Length; +} + /// getSpelling() - Return the 'spelling' of this token. The spelling of a /// token are the characters used to represent the token in the source file /// after trigraph expansion and escaped-newline folding. In particular, this /// wants to get the true, uncanonicalized, spelling of things like digraphs /// UCNs, etc. StringRef Lexer::getSpelling(SourceLocation loc, - SmallVectorImpl<char> &buffer, - const SourceManager &SM, - const LangOptions &options, - bool *invalid) { + SmallVectorImpl<char> &buffer, + const SourceManager &SM, + const LangOptions &options, + bool *invalid) { // Break down the source location. std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); @@ -267,17 +318,10 @@ StringRef Lexer::getSpelling(SourceLocation loc, // Common case: no need for cleaning. if (!token.needsCleaning()) return StringRef(tokenBegin, length); - - // Hard case, we need to relex the characters into the string. - buffer.clear(); - buffer.reserve(length); - - for (const char *ti = tokenBegin, *te = ti + length; ti != te; ) { - unsigned charSize; - buffer.push_back(Lexer::getCharAndSizeNoWarn(ti, charSize, options)); - ti += charSize; - } + // Hard case, we need to relex the characters into the string. 
+ buffer.resize(length); + buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); return StringRef(buffer.data(), buffer.size()); } @@ -289,31 +333,22 @@ StringRef Lexer::getSpelling(SourceLocation loc, std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid) { assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); - - // If this token contains nothing interesting, return it directly. + bool CharDataInvalid = false; - const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(), + const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); if (Invalid) *Invalid = CharDataInvalid; if (CharDataInvalid) return std::string(); - + + // If this token contains nothing interesting, return it directly. if (!Tok.needsCleaning()) - return std::string(TokStart, TokStart+Tok.getLength()); - + return std::string(TokStart, TokStart + Tok.getLength()); + std::string Result; - Result.reserve(Tok.getLength()); - - // Otherwise, hard case, relex the characters into the string. - for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); - Ptr != End; ) { - unsigned CharSize; - Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, LangOpts)); - Ptr += CharSize; - } - assert(Result.size() != unsigned(Tok.getLength()) && - "NeedsCleaning flag set on something that didn't need cleaning!"); + Result.resize(Tok.getLength()); + Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); return Result; } @@ -365,17 +400,7 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, } // Otherwise, hard case, relex the characters into the string. 
- char *OutBuf = const_cast<char*>(Buffer); - for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); - Ptr != End; ) { - unsigned CharSize; - *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, LangOpts); - Ptr += CharSize; - } - assert(unsigned(OutBuf-Buffer) != Tok.getLength() && - "NeedsCleaning flag set on something that didn't need cleaning!"); - - return OutBuf-Buffer; + return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); } -- cgit v1.2.3