Diffstat (limited to 'llvm/lib/TableGen/TGLexer.cpp')
-rw-r--r-- | llvm/lib/TableGen/TGLexer.cpp | 557
1 file changed, 535 insertions, 22 deletions
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index 652be6e8dbb..fcabce5329c 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/TableGen/Error.h"
+#include <algorithm>
 #include <cctype>
 #include <cerrno>
 #include <cstdint>
@@ -28,11 +29,35 @@
 using namespace llvm;
 
-TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
+namespace {
+// A list of supported preprocessing directives with their
+// internal token kinds and names.
+struct {
+  tgtok::TokKind Kind;
+  const char *Word;
+} PreprocessorDirs[] = {
+  { tgtok::Ifdef, "ifdef" },
+  { tgtok::Else, "else" },
+  { tgtok::Endif, "endif" },
+  { tgtok::Define, "define" }
+};
+} // end anonymous namespace
+
+TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
   CurBuffer = SrcMgr.getMainFileID();
   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
   CurPtr = CurBuf.begin();
   TokStart = nullptr;
+
+  // Pretend that we enter the "top-level" include file.
+  PrepIncludeStack.push_back(
+      make_unique<std::vector<PreprocessorControlDesc>>());
+
+  // Put all macros defined in the command line into the DefinedMacros set.
+  std::for_each(Macros.begin(), Macros.end(),
+                [this](const std::string &MacroName) {
+                  DefinedMacros.insert(MacroName);
+                });
 }
 
 SMLoc TGLexer::getLoc() const {
@@ -41,11 +66,42 @@ SMLoc TGLexer::getLoc() const {
 
 /// ReturnError - Set the error to the specified string at the specified
 /// location. This is defined to always return tgtok::Error.
-tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
+tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) {
   PrintError(Loc, Msg);
   return tgtok::Error;
 }
 
+tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
+  return ReturnError(SMLoc::getFromPointer(Loc), Msg);
+}
+
+bool TGLexer::processEOF() {
+  SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
+  if (ParentIncludeLoc != SMLoc()) {
+    // If prepExitInclude() detects a problem with the preprocessing
+    // control stack, it will return false. Pretend that we reached
+    // the final EOF and stop lexing more tokens by returning false
+    // to LexToken().
+    if (!prepExitInclude(false))
+      return false;
+
+    CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
+    CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
+    CurPtr = ParentIncludeLoc.getPointer();
+    // Make sure TokStart points into the parent file's buffer.
+    // LexToken() assigns to it before calling getNextChar(),
+    // so it is pointing into the included file now.
+    TokStart = CurPtr;
+    return true;
+  }
+
+  // Pretend that we exit the "top-level" include file.
+  // Note that in case of an error (e.g. control stack imbalance)
+  // the routine will issue a fatal error.
+  prepExitInclude(true);
+  return false;
+}
+
 int TGLexer::getNextChar() {
   char CurChar = *CurPtr++;
   switch (CurChar) {
@@ -57,16 +113,6 @@ int TGLexer::getNextChar() {
     if (CurPtr-1 != CurBuf.end())
       return 0;  // Just whitespace.
 
-    // If this is the end of an included file, pop the parent file off the
-    // include stack.
-    SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
-    if (ParentIncludeLoc != SMLoc()) {
-      CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
-      CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
-      CurPtr = ParentIncludeLoc.getPointer();
-      return getNextChar();
-    }
-
     // Otherwise, return end of file.
     --CurPtr;  // Another call to lex will return EOF again.
     return EOF;
@@ -83,11 +129,11 @@ int TGLexer::getNextChar() {
   }
 }
 
-int TGLexer::peekNextChar(int Index) {
+int TGLexer::peekNextChar(int Index) const {
   return *(CurPtr + Index);
 }
 
-tgtok::TokKind TGLexer::LexToken() {
+tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
   TokStart = CurPtr;
   // This always consumes at least one character.
   int CurChar = getNextChar();
@@ -100,7 +146,18 @@ tgtok::TokKind TGLexer::LexToken() {
     // Unknown character, emit an error.
     return ReturnError(TokStart, "Unexpected character");
-  case EOF: return tgtok::Eof;
+  case EOF:
+    // Lex next token, if we just left an include file.
+    // Note that leaving an include file means that the next
+    // symbol is located at the end of 'include "..."'
+    // construct, so LexToken() is called with default
+    // false parameter.
+    if (processEOF())
+      return LexToken();
+
+    // Return EOF denoting the end of lexing.
+    return tgtok::Eof;
+
   case ':': return tgtok::colon;
   case ';': return tgtok::semi;
   case '.': return tgtok::period;
@@ -114,15 +171,27 @@ tgtok::TokKind TGLexer::LexToken() {
   case ')': return tgtok::r_paren;
   case '=': return tgtok::equal;
   case '?': return tgtok::question;
-  case '#': return tgtok::paste;
+  case '#':
+    if (FileOrLineStart) {
+      tgtok::TokKind Kind = prepIsDirective();
+      if (Kind != tgtok::Error)
+        return lexPreprocessor(Kind);
+    }
+
+    return tgtok::paste;
+
+  case '\r':
+    PrintFatalError("getNextChar() must never return '\r'");
+    return tgtok::Error;
 
   case 0:
   case ' ':
   case '\t':
-  case '\n':
-  case '\r':
     // Ignore whitespace.
-    return LexToken();
+    return LexToken(FileOrLineStart);
+  case '\n':
+    // Ignore whitespace, and identify the new line.
+    return LexToken(true);
   case '/':
     // If this is the start of a // comment, skip until the end of the line or
     // the end of the buffer.
@@ -133,7 +202,7 @@ tgtok::TokKind TGLexer::LexToken() {
         return tgtok::Error;
     } else // Otherwise, this is an error.
       return ReturnError(TokStart, "Unexpected character");
-    return LexToken();
+    return LexToken(FileOrLineStart);
   case '-': case '+':
   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
   case '7': case '8': case '9': {
@@ -249,10 +318,10 @@ tgtok::TokKind TGLexer::LexVarName() {
 }
 
 tgtok::TokKind TGLexer::LexIdentifier() {
-  // The first letter is [a-zA-Z_#].
+  // The first letter is [a-zA-Z_].
   const char *IdentStart = TokStart;
 
-  // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
+  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
     ++CurPtr;
@@ -322,6 +391,9 @@ bool TGLexer::LexInclude() {
   // Save the line number and lex buffer of the includer.
   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
   CurPtr = CurBuf.begin();
+
+  PrepIncludeStack.push_back(
+      make_unique<std::vector<PreprocessorControlDesc>>());
   return false;
 }
@@ -496,3 +568,444 @@ tgtok::TokKind TGLexer::LexExclaim() {
   return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
 }
+
+bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
+  // Report an error, if preprocessor control stack for the current
+  // file is not empty.
+  if (!PrepIncludeStack.back()->empty()) {
+    prepReportPreprocessorStackError();
+
+    return false;
+  }
+
+  // Pop the preprocessing controls from the include stack.
+  if (PrepIncludeStack.empty()) {
+    PrintFatalError("Preprocessor include stack is empty");
+  }
+
+  PrepIncludeStack.pop_back();
+
+  if (IncludeStackMustBeEmpty) {
+    if (!PrepIncludeStack.empty())
+      PrintFatalError("Preprocessor include stack is not empty");
+  } else {
+    if (PrepIncludeStack.empty())
+      PrintFatalError("Preprocessor include stack is empty");
+  }
+
+  return true;
+}
+
+tgtok::TokKind TGLexer::prepIsDirective() const {
+  for (unsigned ID = 0; ID < llvm::array_lengthof(PreprocessorDirs); ++ID) {
+    int NextChar = *CurPtr;
+    bool Match = true;
+    unsigned I = 0;
+    for (; I < strlen(PreprocessorDirs[ID].Word); ++I) {
+      if (NextChar != PreprocessorDirs[ID].Word[I]) {
+        Match = false;
+        break;
+      }
+
+      NextChar = peekNextChar(I + 1);
+    }
+
+    // Check for whitespace after the directive. If there is no whitespace,
+    // then we do not recognize it as a preprocessing directive.
+    if (Match) {
+      tgtok::TokKind Kind = PreprocessorDirs[ID].Kind;
+
+      // New line and EOF may follow only #else/#endif. It will be reported
+      // as an error for #ifdef/#define after the call to prepLexMacroName().
+      if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
+          NextChar == '\n' ||
+          // It looks like TableGen does not support '\r' as the actual
+          // carriage return, e.g. getNextChar() treats a single '\r'
+          // as '\n'. So we do the same here.
+          NextChar == '\r')
+        return Kind;
+
+      // Allow comments after some directives, e.g.:
+      //     #else// OR #else/**/
+      //     #endif// OR #endif/**/
+      //
+      // Note that we do allow comments after #ifdef/#define here, e.g.
+      //     #ifdef/**/ AND #ifdef//
+      //     #define/**/ AND #define//
+      //
+      // These cases will be reported as incorrect after calling
+      // prepLexMacroName(). We could have supported C-style comments
+      // after #ifdef/#define, but this would complicate the code
+      // for little benefit.
+      if (NextChar == '/') {
+        NextChar = peekNextChar(I + 1);
+
+        if (NextChar == '*' || NextChar == '/')
+          return Kind;
+
+        // Pretend that we do not recognize the directive.
+      }
+    }
+  }
+
+  return tgtok::Error;
+}
+
+bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
+  TokStart = CurPtr;
+
+  for (unsigned ID = 0; ID < llvm::array_lengthof(PreprocessorDirs); ++ID)
+    if (PreprocessorDirs[ID].Kind == Kind) {
+      // Advance CurPtr to the end of the preprocessing word.
+      CurPtr += strlen(PreprocessorDirs[ID].Word);
+      return true;
+    }
+
+  PrintFatalError("Unsupported preprocessing token in "
+                  "prepEatPreprocessorDirective()");
+  return false;
+}
+
+tgtok::TokKind TGLexer::lexPreprocessor(
+    tgtok::TokKind Kind, bool ReturnNextLiveToken) {
+
+  // We must be looking at a preprocessing directive. Eat it!
+  if (!prepEatPreprocessorDirective(Kind))
+    PrintFatalError("lexPreprocessor() called for unknown "
+                    "preprocessor directive");
+
+  if (Kind == tgtok::Ifdef) {
+    StringRef MacroName = prepLexMacroName();
+    if (MacroName.empty())
+      return ReturnError(TokStart, "Expected macro name after #ifdef");
+
+    bool MacroIsDefined = DefinedMacros.count(MacroName) != 0;
+
+    // Regardless of whether we are processing tokens or not,
+    // we put the #ifdef control on stack.
+    PrepIncludeStack.back()->push_back(
+        {Kind, MacroIsDefined, SMLoc::getFromPointer(TokStart)});
+
+    if (!prepSkipDirectiveEnd())
+      return ReturnError(CurPtr,
+                         "Only comments are supported after #ifdef NAME");
+
+    // If we were not processing tokens before this #ifdef,
+    // then just return back to the lines skipping code.
+    if (!ReturnNextLiveToken)
+      return Kind;
+
+    // If we were processing tokens before this #ifdef,
+    // and the macro is defined, then just return the next token.
+    if (MacroIsDefined)
+      return LexToken();
+
+    // We were processing tokens before this #ifdef, and the macro
+    // is not defined, so we have to start skipping the lines.
+    // If the skipping is successful, it will return the token following
+    // either #else or #endif corresponding to this #ifdef.
+    if (prepSkipRegion(ReturnNextLiveToken))
+      return LexToken();
+
+    return tgtok::Error;
+  } else if (Kind == tgtok::Else) {
+    // Check if this #else is correct before calling prepSkipDirectiveEnd(),
+    // which will move CurPtr away from the beginning of #else.
+    if (PrepIncludeStack.back()->empty())
+      return ReturnError(TokStart, "#else without #ifdef");
+
+    auto &IfdefEntry = PrepIncludeStack.back()->back();
+
+    if (IfdefEntry.Kind != tgtok::Ifdef) {
+      PrintError(TokStart, "double #else");
+      return ReturnError(IfdefEntry.SrcPos, "Previous #else is here");
+    }
+
+    // Replace the corresponding #ifdef's control with its negation
+    // on the control stack.
+    PrepIncludeStack.back()->pop_back();
+    PrepIncludeStack.back()->push_back(
+        {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)});
+
+    if (!prepSkipDirectiveEnd())
+      return ReturnError(CurPtr, "Only comments are supported after #else");
+
+    // If we were processing tokens before this #else,
+    // we have to start skipping lines until the matching #endif.
+    if (ReturnNextLiveToken) {
+      if (prepSkipRegion(ReturnNextLiveToken))
+        return LexToken();
+
+      return tgtok::Error;
+    }
+
+    // Return to the lines skipping code.
+    return Kind;
+  } else if (Kind == tgtok::Endif) {
+    // Check if this #endif is correct before calling prepSkipDirectiveEnd(),
+    // which will move CurPtr away from the beginning of #endif.
+    if (PrepIncludeStack.back()->empty())
+      return ReturnError(TokStart, "#endif without #ifdef");
+
+    auto &IfdefOrElseEntry = PrepIncludeStack.back()->back();
+
+    if (IfdefOrElseEntry.Kind != tgtok::Ifdef &&
+        IfdefOrElseEntry.Kind != tgtok::Else) {
+      PrintFatalError("Invalid preprocessor control on the stack");
+      return tgtok::Error;
+    }
+
+    if (!prepSkipDirectiveEnd())
+      return ReturnError(CurPtr, "Only comments are supported after #endif");
+
+    PrepIncludeStack.back()->pop_back();
+
+    // If we were processing tokens before this #endif, then
+    // we should continue it.
+    if (ReturnNextLiveToken) {
+      return LexToken();
+    }
+
+    // Return to the lines skipping code.
+    return Kind;
+  } else if (Kind == tgtok::Define) {
+    StringRef MacroName = prepLexMacroName();
+    if (MacroName.empty())
+      return ReturnError(TokStart, "Expected macro name after #define");
+
+    if (!DefinedMacros.insert(MacroName).second)
+      PrintWarning(getLoc(),
+                   "Duplicate definition of macro: " + Twine(MacroName));
+
+    if (!prepSkipDirectiveEnd())
+      return ReturnError(CurPtr,
+                         "Only comments are supported after #define NAME");
+
+    if (!ReturnNextLiveToken) {
+      PrintFatalError("#define must be ignored during the lines skipping");
+      return tgtok::Error;
+    }
+
+    return LexToken();
+  }
+
+  PrintFatalError("Preprocessing directive is not supported");
+  return tgtok::Error;
+}
+
+bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
+  if (!MustNeverBeFalse)
+    PrintFatalError("Invalid recursion.");
+
+  do {
+    // Skip all symbols to the line end.
+    prepSkipToLineEnd();
+
+    // Find the first non-whitespace symbol in the next line(s).
+    if (!prepSkipLineBegin())
+      return false;
+
+    // If the first non-blank/comment symbol on the line is '#',
+    // it may be a start of preprocessing directive.
+    //
+    // If it is not '#' just go to the next line.
+    if (*CurPtr == '#')
+      ++CurPtr;
+    else
+      continue;
+
+    tgtok::TokKind Kind = prepIsDirective();
+
+    // If we did not find a preprocessing directive or it is #define,
+    // then just skip to the next line. We do not have to do anything
+    // for #define in the line-skipping mode.
+    if (Kind == tgtok::Error || Kind == tgtok::Define)
+      continue;
+
+    tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false);
+
+    // If lexPreprocessor() encountered an error during lexing this
+    // preprocessor idiom, then return false to the calling lexPreprocessor().
+    // This will force tgtok::Error to be returned to the tokens processing.
+    if (ProcessedKind == tgtok::Error)
+      return false;
+
+    if (Kind != ProcessedKind)
+      PrintFatalError("prepIsDirective() and lexPreprocessor() "
+                      "returned different token kinds");
+
+    // If this preprocessing directive enables tokens processing,
+    // then return to the lexPreprocessor() and get to the next token.
+    // We can move from line-skipping mode to processing tokens only
+    // due to #else or #endif.
+    if (prepIsProcessingEnabled()) {
+      if (Kind != tgtok::Else && Kind != tgtok::Endif) {
+        PrintFatalError("Tokens processing was enabled by an unexpected "
+                        "preprocessing directive");
+        return false;
+      }
+
+      return true;
+    }
+  } while (CurPtr != CurBuf.end());
+
+  // We have reached the end of the file, but never left the lines-skipping
+  // mode. This means there is no matching #endif.
+  prepReportPreprocessorStackError();
+  return false;
+}
+
+StringRef TGLexer::prepLexMacroName() {
+  // Skip whitespaces between the preprocessing directive and the macro name.
+  while (*CurPtr == ' ' || *CurPtr == '\t')
+    ++CurPtr;
+
+  TokStart = CurPtr;
+  // Macro names start with [a-zA-Z_].
+  if (*CurPtr != '_' && !isalpha(*CurPtr))
+    return "";
+
+  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
+  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
+    ++CurPtr;
+
+  return StringRef(TokStart, CurPtr - TokStart);
+}
+
+bool TGLexer::prepSkipLineBegin() {
+  while (CurPtr != CurBuf.end()) {
+    switch (*CurPtr) {
+    case ' ':
+    case '\t':
+    case '\n':
+    case '\r':
+      break;
+
+    case '/': {
+      int NextChar = peekNextChar(1);
+      if (NextChar == '*') {
+        // Skip C-style comment.
+        // Note that we do not care about skipping the C++-style comments.
+        // If the line contains "//", it may not contain any processable
+        // preprocessing directive. Just return CurPtr pointing to
+        // the first '/' in this case. We also do not care about
+        // incorrect symbols after the first '/' - we are in lines-skipping
+        // mode, so incorrect code is allowed to some extent.
+
+        // Set TokStart to the beginning of the comment to enable proper
+        // diagnostic printing in case of error in SkipCComment().
+        TokStart = CurPtr;
+
+        // CurPtr must point to '*' before call to SkipCComment().
+        ++CurPtr;
+        if (SkipCComment())
+          return false;
+      } else {
+        // CurPtr points to the non-whitespace '/'.
+        return true;
+      }
+
+      // We must not increment CurPtr after the comment was lexed.
+      continue;
+    }
+
+    default:
+      return true;
+    }
+
+    ++CurPtr;
+  }
+
+  // We have reached the end of the file. Return to the lines skipping
+  // code, and allow it to handle the EOF as needed.
+  return true;
+}
+
+bool TGLexer::prepSkipDirectiveEnd() {
+  while (CurPtr != CurBuf.end()) {
+    switch (*CurPtr) {
+    case ' ':
+    case '\t':
+      break;
+
+    case '\n':
+    case '\r':
+      return true;
+
+    case '/': {
+      int NextChar = peekNextChar(1);
+      if (NextChar == '/') {
+        // Skip C++-style comment.
+        // We may just return true now, but let's skip to the line/buffer end
+        // to simplify the method specification.
+        ++CurPtr;
+        SkipBCPLComment();
+      } else if (NextChar == '*') {
+        // When we are skipping C-style comment at the end of a preprocessing
+        // directive, we can skip several lines. If any meaningful TD token
+        // follows the end of the C-style comment on the same line, it will
+        // be considered as an invalid usage of TD token.
+        // For example, we want to forbid usages like this one:
+        //     #define MACRO class Class {}
+        // But with C-style comments we also disallow the following:
+        //     #define MACRO /* This macro is used
+        //                      to ... */ class Class {}
+        // One can argue that this should be allowed, but it does not seem
+        // to be worth of the complication. Moreover, this matches
+        // the C preprocessor behavior.
+
+        // Set TokStart to the beginning of the comment to enable proper
+        // diagnostic printer in case of error in SkipCComment().
+        TokStart = CurPtr;
+        ++CurPtr;
+        if (SkipCComment())
+          return false;
+      } else {
+        TokStart = CurPtr;
+        PrintError(CurPtr, "Unexpected character");
+        return false;
+      }
+
+      // We must not increment CurPtr after the comment was lexed.
+      continue;
+    }
+
+    default:
+      // Do not allow any non-whitespaces after the directive.
+      TokStart = CurPtr;
+      return false;
+    }
+
+    ++CurPtr;
+  }
+
+  return true;
+}
+
+void TGLexer::prepSkipToLineEnd() {
+  while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end())
+    ++CurPtr;
+}
+
+bool TGLexer::prepIsProcessingEnabled() {
+  for (auto I = PrepIncludeStack.back()->rbegin(),
+            E = PrepIncludeStack.back()->rend();
+       I != E; ++I) {
+    if (!I->IsDefined)
+      return false;
+  }
+
+  return true;
+}
+
+void TGLexer::prepReportPreprocessorStackError() {
+  if (PrepIncludeStack.back()->empty())
+    PrintFatalError("prepReportPreprocessorStackError() called with "
+                    "empty control stack");
+
+  auto &PrepControl = PrepIncludeStack.back()->back();
+  PrintError(CurBuf.end(), "Reached EOF without matching #endif");
+  PrintError(PrepControl.SrcPos, "The latest preprocessor control is here");
+
+  TokStart = CurPtr;
+}
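
For context, the directives recognized by the new prepIsDirective()/lexPreprocessor() paths are the line-oriented #ifdef, #else, #endif and #define, and they are only recognized when '#' starts a line (FileOrLineStart); elsewhere '#' still lexes as tgtok::paste. Below is a minimal, hypothetical TableGen input sketching how the feature behaves; the macro name MY_MACRO and the record names are invented for illustration, and a macro could equally be supplied through the Macros argument of the new TGLexer constructor rather than a #define line.

    // example.td (hypothetical, for illustration only)
    #define MY_MACRO        // only comments may follow "#define NAME"

    #ifdef MY_MACRO         // handled by lexPreprocessor(tgtok::Ifdef)
    def EnabledRecord;      // lexed normally: MY_MACRO is in DefinedMacros
    #else                   // replaces the #ifdef control with its negation
    def DisabledRecord;     // skipped line-by-line by prepSkipRegion()
    #endif                  // pops the control; normal lexing resumes

Note that the control stack is kept per include file (PrepIncludeStack): an #ifdef opened inside an included file must be closed before that file's EOF, otherwise prepExitInclude() reports the imbalance through prepReportPreprocessorStackError().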