Diffstat (limited to 'clang/lib/Lex')
 -rw-r--r--  clang/lib/Lex/HeaderMap.cpp          242
 -rw-r--r--  clang/lib/Lex/HeaderSearch.cpp       425
 -rw-r--r--  clang/lib/Lex/Lexer.cpp             1661
 -rw-r--r--  clang/lib/Lex/LiteralSupport.cpp     691
 -rw-r--r--  clang/lib/Lex/MacroArgs.cpp          225
 -rw-r--r--  clang/lib/Lex/MacroArgs.h            109
 -rw-r--r--  clang/lib/Lex/MacroInfo.cpp           70
 -rw-r--r--  clang/lib/Lex/Makefile                28
 -rw-r--r--  clang/lib/Lex/PPDirectives.cpp      1153
 -rw-r--r--  clang/lib/Lex/PPExpressions.cpp      639
 -rw-r--r--  clang/lib/Lex/PPLexerChange.cpp      401
 -rw-r--r--  clang/lib/Lex/PPMacroExpansion.cpp   523
 -rw-r--r--  clang/lib/Lex/Pragma.cpp             386
 -rw-r--r--  clang/lib/Lex/Preprocessor.cpp       560
 -rw-r--r--  clang/lib/Lex/ScratchBuffer.cpp       72
 -rw-r--r--  clang/lib/Lex/TokenLexer.cpp         488
16 files changed, 7673 insertions(+), 0 deletions(-)
diff --git a/clang/lib/Lex/HeaderMap.cpp b/clang/lib/Lex/HeaderMap.cpp
new file mode 100644
index 00000000000..282e742b4c8
--- /dev/null
+++ b/clang/lib/Lex/HeaderMap.cpp
@@ -0,0 +1,242 @@
+//===--- HeaderMap.cpp - A file that acts like dir of symlinks ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the HeaderMap interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/HeaderMap.h"
+#include "clang/Basic/FileManager.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryBuffer.h"
+using namespace clang;
+
+//===----------------------------------------------------------------------===//
+// Data Structures and Manifest Constants
+//===----------------------------------------------------------------------===//
+
+enum {
+  HMAP_HeaderMagicNumber = ('h' << 24) | ('m' << 16) | ('a' << 8) | 'p',
+  HMAP_HeaderVersion = 1,
+
+  HMAP_EmptyBucketKey = 0
+};
+
+namespace clang {
+struct HMapBucket {
+  uint32_t Key;      // Offset (into strings) of key.
+
+  uint32_t Prefix;   // Offset (into strings) of value prefix.
+  uint32_t Suffix;   // Offset (into strings) of value suffix.
+};
+
+struct HMapHeader {
+  uint32_t Magic;           // Magic word, also indicates byte order.
+  uint16_t Version;         // Version number -- currently 1.
+  uint16_t Reserved;        // Reserved for future use - zero for now.
+  uint32_t StringsOffset;   // Offset to start of string pool.
+  uint32_t NumEntries;      // Number of entries in the string table.
+  uint32_t NumBuckets;      // Number of buckets (always a power of 2).
+  uint32_t MaxValueLength;  // Length of longest result path (excluding nul).
+  // An array of 'NumBuckets' HMapBucket objects follows this header.
+  // Strings follow the buckets, at StringsOffset.
+};
+} // end namespace clang.
+
+/// HashHMapKey - This is the 'well known' hash function required by the file
+/// format, used to look up keys in the hash table. The hash table uses simple
+/// linear probing based on this function.
+static inline unsigned HashHMapKey(const char *S, const char *End) {
+  unsigned Result = 0;
+
+  for (; S != End; S++)
+    Result += tolower(*S) * 13;
+  return Result;
+}
+
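A quick illustration of the arithmetic this format implies (a standalone sketch, not part of the commit): because NumBuckets is a power of two, the starting bucket for a key is the case-insensitive hash masked by NumBuckets-1, and probing then advances linearly from there.

    #include <cctype>

    // Minimal model of the starting-bucket computation used by LookupFile.
    static unsigned StartBucket(const char *S, const char *End,
                                unsigned NumBuckets) {
      unsigned Hash = 0;
      for (; S != End; ++S)
        Hash += std::tolower(*S) * 13;  // same 'well known' hash as HashHMapKey
      return Hash & (NumBuckets - 1);   // mask instead of modulo; probe with ++
    }
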
+//===----------------------------------------------------------------------===//
+// Verification and Construction
+//===----------------------------------------------------------------------===//
+
+/// HeaderMap::Create - This attempts to load the specified file as a header
+/// map. If it doesn't look like a HeaderMap, it gives up and returns null.
+/// If it looks like a HeaderMap but is obviously corrupted, it puts a reason
+/// into the string error argument and returns null.
+const HeaderMap *HeaderMap::Create(const FileEntry *FE) {
+  // If the file is too small to be a header map, ignore it.
+  unsigned FileSize = FE->getSize();
+  if (FileSize <= sizeof(HMapHeader)) return 0;
+
+  llvm::OwningPtr<const llvm::MemoryBuffer> FileBuffer(
+    llvm::MemoryBuffer::getFile(FE->getName(), strlen(FE->getName()), 0,
+                                FE->getSize()));
+  if (FileBuffer == 0) return 0;  // Unreadable file?
+  const char *FileStart = FileBuffer->getBufferStart();
+
+  // We know the file is at least as big as the header, check it now.
+  const HMapHeader *Header = reinterpret_cast<const HMapHeader*>(FileStart);
+
+  // Sniff it to see if it's a headermap by checking the magic number and
+  // version.
+  bool NeedsByteSwap;
+  if (Header->Magic == HMAP_HeaderMagicNumber &&
+      Header->Version == HMAP_HeaderVersion)
+    NeedsByteSwap = false;
+  else if (Header->Magic == llvm::ByteSwap_32(HMAP_HeaderMagicNumber) &&
+           Header->Version == llvm::ByteSwap_16(HMAP_HeaderVersion))
+    NeedsByteSwap = true;  // Mixed endianness headermap.
+  else
+    return 0;  // Not a header map.
+
+  if (Header->Reserved != 0) return 0;
+
+  // Okay, everything looks good, create the header map.
+  return new HeaderMap(FileBuffer.take(), NeedsByteSwap);
+}
+
+HeaderMap::~HeaderMap() {
+  delete FileBuffer;
+}
+
+//===----------------------------------------------------------------------===//
+//  Utility Methods
+//===----------------------------------------------------------------------===//
+
+
+/// getFileName - Return the filename of the headermap.
+const char *HeaderMap::getFileName() const {
+  return FileBuffer->getBufferIdentifier();
+}
+
+unsigned HeaderMap::getEndianAdjustedWord(unsigned X) const {
+  if (!NeedsBSwap) return X;
+  return llvm::ByteSwap_32(X);
+}
+
+/// getHeader - Return a reference to the file header, in unbyte-swapped form.
+/// This method cannot fail.
+const HMapHeader &HeaderMap::getHeader() const {
+  // We know the file is at least as big as the header. Return it.
+  return *reinterpret_cast<const HMapHeader*>(FileBuffer->getBufferStart());
+}
+
+/// getBucket - Return the specified hash table bucket from the header map,
+/// bswap'ing its fields as appropriate. If the bucket number is not valid,
+/// this returns a bucket with an empty key (0).
+HMapBucket HeaderMap::getBucket(unsigned BucketNo) const {
+  HMapBucket Result;
+  Result.Key = HMAP_EmptyBucketKey;
+
+  const HMapBucket *BucketArray =
+    reinterpret_cast<const HMapBucket*>(FileBuffer->getBufferStart() +
+                                        sizeof(HMapHeader));
+
+  const HMapBucket *BucketPtr = BucketArray+BucketNo;
+  if ((char*)(BucketPtr+1) > FileBuffer->getBufferEnd())
+    return Result;  // Invalid buffer, corrupt hmap.
+
+  // Otherwise, the bucket is valid. Load the values, bswapping as needed.
+  Result.Key    = getEndianAdjustedWord(BucketPtr->Key);
+  Result.Prefix = getEndianAdjustedWord(BucketPtr->Prefix);
+  Result.Suffix = getEndianAdjustedWord(BucketPtr->Suffix);
+  return Result;
+}
+
+/// getString - Look up the specified string in the string table. If the
+/// string index is not valid, it returns null.
+const char *HeaderMap::getString(unsigned StrTabIdx) const {
+  // Add the start of the string table to the idx.
+  StrTabIdx += getEndianAdjustedWord(getHeader().StringsOffset);
+
+  // Check for invalid index.
+  if (StrTabIdx >= FileBuffer->getBufferSize())
+    return 0;
+
+  // Otherwise, we have a valid pointer into the file. Just return it. We know
+  // that the "string" can not overrun the end of the file, because the buffer
+  // is nul terminated by virtue of being a MemoryBuffer.
+  return FileBuffer->getBufferStart()+StrTabIdx;
+}
+/// StringsEqualWithoutCase - Compare the specified two strings for case-
+/// insensitive equality, returning true if they are equal. Both strings are
+/// known to have the same length.
+static bool StringsEqualWithoutCase(const char *S1, const char *S2,
+                                    unsigned Len) {
+  for (; Len; ++S1, ++S2, --Len)
+    if (tolower(*S1) != tolower(*S2))
+      return false;
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// The Main Drivers
+//===----------------------------------------------------------------------===//
+
+/// dump - Print the contents of this headermap to stderr.
+void HeaderMap::dump() const {
+  const HMapHeader &Hdr = getHeader();
+  unsigned NumBuckets = getEndianAdjustedWord(Hdr.NumBuckets);
+
+  fprintf(stderr, "Header Map %s:\n  %d buckets, %d entries\n",
+          getFileName(), NumBuckets,
+          getEndianAdjustedWord(Hdr.NumEntries));
+
+  for (unsigned i = 0; i != NumBuckets; ++i) {
+    HMapBucket B = getBucket(i);
+    if (B.Key == HMAP_EmptyBucketKey) continue;
+
+    const char *Key    = getString(B.Key);
+    const char *Prefix = getString(B.Prefix);
+    const char *Suffix = getString(B.Suffix);
+    fprintf(stderr, "  %d. %s -> '%s' '%s'\n", i, Key, Prefix, Suffix);
+  }
+}
+
+/// LookupFile - Check to see if the specified relative filename is located in
+/// this HeaderMap. If so, open it and return its FileEntry.
+const FileEntry *HeaderMap::LookupFile(const char *FilenameStart,
+                                       const char *FilenameEnd,
+                                       FileManager &FM) const {
+  const HMapHeader &Hdr = getHeader();
+  unsigned NumBuckets = getEndianAdjustedWord(Hdr.NumBuckets);
+
+  // If the number of buckets is not a power of two, the headermap is corrupt.
+  // Don't probe infinitely.
+  if (NumBuckets & (NumBuckets-1))
+    return 0;
+
+  // Linearly probe the hash table.
+  for (unsigned Bucket = HashHMapKey(FilenameStart, FilenameEnd);; ++Bucket) {
+    HMapBucket B = getBucket(Bucket & (NumBuckets-1));
+    if (B.Key == HMAP_EmptyBucketKey) return 0;  // Hash miss.
+
+    // See if the key matches. If not, probe on.
+    const char *Key = getString(B.Key);
+    unsigned BucketKeyLen = strlen(Key);
+    if (BucketKeyLen != unsigned(FilenameEnd-FilenameStart))
+      continue;
+
+    // See if the actual strings are equal.
+    if (!StringsEqualWithoutCase(FilenameStart, Key, BucketKeyLen))
+      continue;
+
+    // If so, we have a match in the hash table. Construct the destination
+    // path.
+    llvm::SmallString<1024> DestPath;
+    DestPath += getString(B.Prefix);
+    DestPath += getString(B.Suffix);
+    return FM.getFile(DestPath.begin(), DestPath.end());
+  }
+}
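How a client drives this interface, as a minimal sketch (the header map file and key names here are hypothetical, not from the commit):

    #include <cstdio>
    #include <cstring>

    // Hypothetical usage: load a header map file and resolve one key through
    // it. Create() returns null for files that aren't header maps, and hands
    // ownership of the map to the caller on success.
    void DemoHeaderMapLookup(clang::FileManager &FM,
                             const clang::FileEntry *HMapFile) {
      if (const clang::HeaderMap *HM = clang::HeaderMap::Create(HMapFile)) {
        const char *Name = "Foo/Bar.h";  // hypothetical key
        if (const clang::FileEntry *FE =
              HM->LookupFile(Name, Name+std::strlen(Name), FM))
          std::fprintf(stderr, "resolved to %s\n", FE->getName());
        delete HM;
      }
    }
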
diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp
new file mode 100644
index 00000000000..44ae35c8b7e
--- /dev/null
+++ b/clang/lib/Lex/HeaderSearch.cpp
@@ -0,0 +1,425 @@
+//===--- HeaderSearch.cpp - Resolve Header File Locations ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the DirectoryLookup and HeaderSearch interfaces.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/HeaderMap.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "llvm/System/Path.h"
+#include "llvm/ADT/SmallString.h"
+using namespace clang;
+
+HeaderSearch::HeaderSearch(FileManager &FM) : FileMgr(FM), FrameworkMap(64) {
+  SystemDirIdx = 0;
+  NoCurDirSearch = false;
+
+  NumIncluded = 0;
+  NumMultiIncludeFileOptzn = 0;
+  NumFrameworkLookups = NumSubFrameworkLookups = 0;
+}
+
+HeaderSearch::~HeaderSearch() {
+  // Delete headermaps.
+  for (unsigned i = 0, e = HeaderMaps.size(); i != e; ++i)
+    delete HeaderMaps[i].second;
+}
+
+void HeaderSearch::PrintStats() {
+  fprintf(stderr, "\n*** HeaderSearch Stats:\n");
+  fprintf(stderr, "%d files tracked.\n", (int)FileInfo.size());
+  unsigned NumOnceOnlyFiles = 0, MaxNumIncludes = 0, NumSingleIncludedFiles = 0;
+  for (unsigned i = 0, e = FileInfo.size(); i != e; ++i) {
+    NumOnceOnlyFiles += FileInfo[i].isImport;
+    if (MaxNumIncludes < FileInfo[i].NumIncludes)
+      MaxNumIncludes = FileInfo[i].NumIncludes;
+    NumSingleIncludedFiles += FileInfo[i].NumIncludes == 1;
+  }
+  fprintf(stderr, "  %d #import/#pragma once files.\n", NumOnceOnlyFiles);
+  fprintf(stderr, "  %d included exactly once.\n", NumSingleIncludedFiles);
+  fprintf(stderr, "  %d max times a file is included.\n", MaxNumIncludes);
+
+  fprintf(stderr, "  %d #include/#include_next/#import.\n", NumIncluded);
+  fprintf(stderr, "    %d #includes skipped due to"
+          " the multi-include optimization.\n", NumMultiIncludeFileOptzn);
+
+  fprintf(stderr, "%d framework lookups.\n", NumFrameworkLookups);
+  fprintf(stderr, "%d subframework lookups.\n", NumSubFrameworkLookups);
+}
+
+/// CreateHeaderMap - This method returns a HeaderMap for the specified
+/// FileEntry, uniquing them through the 'HeaderMaps' data structure.
+const HeaderMap *HeaderSearch::CreateHeaderMap(const FileEntry *FE) {
+  // We expect the number of headermaps to be small, and almost always empty.
+  // If it ever grows, use of a linear search should be re-evaluated.
+  if (!HeaderMaps.empty()) {
+    for (unsigned i = 0, e = HeaderMaps.size(); i != e; ++i)
+      // Pointer equality comparison of FileEntries works because they are
+      // already uniqued by inode.
+      if (HeaderMaps[i].first == FE)
+        return HeaderMaps[i].second;
+  }
+
+  if (const HeaderMap *HM = HeaderMap::Create(FE)) {
+    HeaderMaps.push_back(std::make_pair(FE, HM));
+    return HM;
+  }
+
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// File lookup within a DirectoryLookup scope
+//===----------------------------------------------------------------------===//
+
+/// getName - Return the directory or filename corresponding to this lookup
+/// object.
+const char *DirectoryLookup::getName() const {
+  if (isNormalDir())
+    return getDir()->getName();
+  if (isFramework())
+    return getFrameworkDir()->getName();
+  assert(isHeaderMap() && "Unknown DirectoryLookup");
+  return getHeaderMap()->getFileName();
+}
+
+/// LookupFile - Lookup the specified file in this search path, returning it
+/// if it exists or returning null if not.
+const FileEntry *DirectoryLookup::LookupFile(const char *FilenameStart,
+                                             const char *FilenameEnd,
+                                             HeaderSearch &HS) const {
+  llvm::SmallString<1024> TmpDir;
+  if (isNormalDir()) {
+    // Concatenate the requested file onto the directory.
+    // FIXME: Portability. Filename concatenation should be in sys::Path.
+    TmpDir += getDir()->getName();
+    TmpDir.push_back('/');
+    TmpDir.append(FilenameStart, FilenameEnd);
+    return HS.getFileMgr().getFile(TmpDir.begin(), TmpDir.end());
+  }
+
+  if (isFramework())
+    return DoFrameworkLookup(FilenameStart, FilenameEnd, HS);
+
+  assert(isHeaderMap() && "Unknown directory lookup");
+  return getHeaderMap()->LookupFile(FilenameStart, FilenameEnd,HS.getFileMgr());
+}
+
+
+/// DoFrameworkLookup - Do a lookup of the specified file in the current
+/// DirectoryLookup, which is a framework directory.
+const FileEntry *DirectoryLookup::DoFrameworkLookup(const char *FilenameStart,
+                                                    const char *FilenameEnd,
+                                                    HeaderSearch &HS) const {
+  FileManager &FileMgr = HS.getFileMgr();
+
+  // Framework names must have a '/' in the filename.
+  const char *SlashPos = std::find(FilenameStart, FilenameEnd, '/');
+  if (SlashPos == FilenameEnd) return 0;
+
+  // Find out if this is the home for the specified framework, by checking
+  // HeaderSearch. Possible answers are yes, no, and unknown.
+  const DirectoryEntry *&FrameworkDirCache =
+    HS.LookupFrameworkCache(FilenameStart, SlashPos);
+
+  // If it is known and in some other directory, fail.
+  if (FrameworkDirCache && FrameworkDirCache != getFrameworkDir())
+    return 0;
+
+  // Otherwise, construct the path to this framework dir.
+
+  // FrameworkName = "/System/Library/Frameworks/"
+  llvm::SmallString<1024> FrameworkName;
+  FrameworkName += getFrameworkDir()->getName();
+  if (FrameworkName.empty() || FrameworkName.back() != '/')
+    FrameworkName.push_back('/');
+
+  // FrameworkName = "/System/Library/Frameworks/Cocoa"
+  FrameworkName.append(FilenameStart, SlashPos);
+
+  // FrameworkName = "/System/Library/Frameworks/Cocoa.framework/"
+  FrameworkName += ".framework/";
+
+  // If the cache entry is still unresolved, check the existence of the
+  // framework directory now.
+  if (FrameworkDirCache == 0) {
+    HS.IncrementFrameworkLookupCount();
+
+    // If the framework dir doesn't exist, we fail.
+    // FIXME: It's probably more efficient to query this with FileMgr.getDir.
+    if (!llvm::sys::Path(std::string(FrameworkName.begin(),
+                                     FrameworkName.end())).exists())
+      return 0;
+
+    // Otherwise, if it does, remember that this is the right direntry for this
+    // framework.
+    FrameworkDirCache = getFrameworkDir();
+  }
+
+  // Check "/System/Library/Frameworks/Cocoa.framework/Headers/file.h"
+  unsigned OrigSize = FrameworkName.size();
+
+  FrameworkName += "Headers/";
+  FrameworkName.append(SlashPos+1, FilenameEnd);
+  if (const FileEntry *FE = FileMgr.getFile(FrameworkName.begin(),
+                                            FrameworkName.end())) {
+    return FE;
+  }
+
+  // Check "/System/Library/Frameworks/Cocoa.framework/PrivateHeaders/file.h"
+  const char *Private = "Private";
+  FrameworkName.insert(FrameworkName.begin()+OrigSize, Private,
+                       Private+strlen(Private));
+  return FileMgr.getFile(FrameworkName.begin(), FrameworkName.end());
+}
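The "Headers/" then "PrivateHeaders/" probing above relies on remembering the framework prefix length and splicing "Private" into the same buffer. A self-contained sketch of that string surgery (std::string instead of SmallString, paths illustrative):

    #include <cstdio>
    #include <string>

    int main() {
      std::string FrameworkName = "/System/Library/Frameworks/Cocoa.framework/";
      std::size_t OrigSize = FrameworkName.size();  // remember prefix length
      FrameworkName += "Headers/Cocoa.h";
      std::printf("%s\n", FrameworkName.c_str());   // ...Headers/Cocoa.h
      FrameworkName.insert(OrigSize, "Private");    // retarget same buffer
      std::printf("%s\n", FrameworkName.c_str());   // ...PrivateHeaders/Cocoa.h
    }
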
+
+//===----------------------------------------------------------------------===//
+// Header File Location.
+//===----------------------------------------------------------------------===//
+
+
+/// LookupFile - Given a "foo" or <foo> reference, look up the indicated file,
+/// return null on failure. isAngled indicates whether the file reference is
+/// for system #include's or not (i.e. using <> instead of ""). CurFileEnt, if
+/// non-null, indicates where the #including file is, in case a relative search
+/// is needed.
+const FileEntry *HeaderSearch::LookupFile(const char *FilenameStart,
+                                          const char *FilenameEnd,
+                                          bool isAngled,
+                                          const DirectoryLookup *FromDir,
+                                          const DirectoryLookup *&CurDir,
+                                          const FileEntry *CurFileEnt) {
+  // If 'Filename' is absolute, check to see if it exists; no searching is
+  // needed.
+  // FIXME: Portability. This should be a sys::Path interface, this doesn't
+  // handle things like C:\foo.txt right, nor win32 \\network\device\blah.
+  if (FilenameStart[0] == '/') {
+    CurDir = 0;
+
+    // If this was an #include_next "/absolute/file", fail.
+    if (FromDir) return 0;
+
+    // Otherwise, just return the file.
+    return FileMgr.getFile(FilenameStart, FilenameEnd);
+  }
+
+  // Step #0, unless disabled, check to see if the file is in the #includer's
+  // directory. This has to be based on CurFileEnt, not CurDir, because
+  // CurFileEnt could be a #include of a subdirectory (#include "foo/bar.h") and
+  // a subsequent include of "baz.h" should resolve to "whatever/foo/baz.h".
+  // This search is not done for <> headers.
+  if (CurFileEnt && !isAngled && !NoCurDirSearch) {
+    llvm::SmallString<1024> TmpDir;
+    // Concatenate the requested file onto the directory.
+    // FIXME: Portability. Filename concatenation should be in sys::Path.
+    TmpDir += CurFileEnt->getDir()->getName();
+    TmpDir.push_back('/');
+    TmpDir.append(FilenameStart, FilenameEnd);
+    if (const FileEntry *FE = FileMgr.getFile(TmpDir.begin(), TmpDir.end())) {
+      // Leave CurDir unset.
+      // This file is a system header or C++ unfriendly if the old file is.
+      //
+      // Note that the temporary 'DirInfo' is required here, as either call to
+      // getFileInfo could resize the vector and we don't want to rely on order
+      // of evaluation.
+      unsigned DirInfo = getFileInfo(CurFileEnt).DirInfo;
+      getFileInfo(FE).DirInfo = DirInfo;
+      return FE;
+    }
+  }
+
+  CurDir = 0;
+
+  // If this is a system #include, ignore the user #include locs.
+  unsigned i = isAngled ? SystemDirIdx : 0;
+
+  // If this is a #include_next request, start searching after the directory the
+  // file was found in.
+  if (FromDir)
+    i = FromDir-&SearchDirs[0];
+
+  // Cache all of the lookups performed by this method. Many headers are
+  // multiply included, and the "pragma once" optimization prevents them from
+  // being relex/pp'd, but they would still have to search through a
+  // (potentially huge) series of SearchDirs to find it.
+  std::pair<unsigned, unsigned> &CacheLookup =
+    LookupFileCache.GetOrCreateValue(FilenameStart, FilenameEnd).getValue();
+
+  // If the entry has been previously looked up, the first value will be
+  // non-zero. If it is equal to i+1 (the start point of our search plus one),
+  // then this is a matching hit.
+  if (CacheLookup.first == i+1) {
+    // Skip querying potentially lots of directories for this lookup.
+    i = CacheLookup.second;
+  } else {
+    // Otherwise, this is the first query, or the previous query didn't match
+    // our search start. We will fill in our found location below, so prime the
+    // start point value.
+    CacheLookup.first = i+1;
+  }
+
+  // Check each directory in sequence to see if it contains this file.
+  for (; i != SearchDirs.size(); ++i) {
+    const FileEntry *FE =
+      SearchDirs[i].LookupFile(FilenameStart, FilenameEnd, *this);
+    if (!FE) continue;
+
+    CurDir = &SearchDirs[i];
+
+    // This file is a system header or C++ unfriendly if the dir is.
+    getFileInfo(FE).DirInfo = CurDir->getDirCharacteristic();
+
+    // Remember this location for the next lookup we do.
+    CacheLookup.second = i;
+    return FE;
+  }
+
+  // Otherwise, didn't find it. Remember we didn't find this.
+  CacheLookup.second = SearchDirs.size();
+  return 0;
+}
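A worked trace of the cache protocol above, with hypothetical numbers (illustration only):

    // Suppose "foo.h" is requested with SearchDirs.size() == 10, starting at
    // i == 0, and the file is found in SearchDirs[7]:
    //   first lookup:  CacheLookup = (0+1, 7) after probing dirs 0..7
    //   repeat lookup: CacheLookup.first == 0+1, so probing starts at i == 7
    // An #include_next of the same name starting at i == 8 sees
    // first != 8+1, so the cache is re-primed rather than trusted.
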
+
+/// LookupSubframeworkHeader - Look up a subframework for the specified
+/// #include file. For example, if #include'ing <HIToolbox/HIToolbox.h> from
+/// within ".../Carbon.framework/Headers/Carbon.h", check to see if HIToolbox
+/// is a subframework within Carbon.framework. If so, return the FileEntry
+/// for the designated file, otherwise return null.
+const FileEntry *HeaderSearch::
+LookupSubframeworkHeader(const char *FilenameStart,
+                         const char *FilenameEnd,
+                         const FileEntry *ContextFileEnt) {
+  assert(ContextFileEnt && "No context file?");
+
+  // Framework names must have a '/' in the filename. Find it.
+  const char *SlashPos = std::find(FilenameStart, FilenameEnd, '/');
+  if (SlashPos == FilenameEnd) return 0;
+
+  // Look up the base framework name of the ContextFileEnt.
+  const char *ContextName = ContextFileEnt->getName();
+
+  // If the context file wasn't in a framework, this can't be a subframework.
+  const char *FrameworkPos = strstr(ContextName, ".framework/");
+  if (FrameworkPos == 0)
+    return 0;
+
+  llvm::SmallString<1024> FrameworkName(ContextName,
+                                        FrameworkPos+strlen(".framework/"));
+
+  // Append Frameworks/HIToolbox.framework/
+  FrameworkName += "Frameworks/";
+  FrameworkName.append(FilenameStart, SlashPos);
+  FrameworkName += ".framework/";
+
+  llvm::StringMapEntry<const DirectoryEntry *> &CacheLookup =
+    FrameworkMap.GetOrCreateValue(FilenameStart, SlashPos);
+
+  // Some other location?
+  if (CacheLookup.getValue() &&
+      CacheLookup.getKeyLength() == FrameworkName.size() &&
+      memcmp(CacheLookup.getKeyData(), &FrameworkName[0],
+             CacheLookup.getKeyLength()) != 0)
+    return 0;
+
+  // Cache subframework.
+  if (CacheLookup.getValue() == 0) {
+    ++NumSubFrameworkLookups;
+
+    // If the framework dir doesn't exist, we fail.
+    const DirectoryEntry *Dir = FileMgr.getDirectory(FrameworkName.begin(),
+                                                     FrameworkName.end());
+    if (Dir == 0) return 0;
+
+    // Otherwise, if it does, remember that this is the right direntry for this
+    // framework.
+    CacheLookup.setValue(Dir);
+  }
+
+  const FileEntry *FE = 0;
+
+  // Check ".../Frameworks/HIToolbox.framework/Headers/HIToolbox.h"
+  llvm::SmallString<1024> HeadersFilename(FrameworkName);
+  HeadersFilename += "Headers/";
+  HeadersFilename.append(SlashPos+1, FilenameEnd);
+  if (!(FE = FileMgr.getFile(HeadersFilename.begin(),
+                             HeadersFilename.end()))) {
+
+    // Check ".../Frameworks/HIToolbox.framework/PrivateHeaders/HIToolbox.h"
+    HeadersFilename = FrameworkName;
+    HeadersFilename += "PrivateHeaders/";
+    HeadersFilename.append(SlashPos+1, FilenameEnd);
+    if (!(FE = FileMgr.getFile(HeadersFilename.begin(), HeadersFilename.end())))
+      return 0;
+  }
+
+  // This file is a system header or C++ unfriendly if the old file is.
+  //
+  // Note that the temporary 'DirInfo' is required here, as either call to
+  // getFileInfo could resize the vector and we don't want to rely on order
+  // of evaluation.
+  unsigned DirInfo = getFileInfo(ContextFileEnt).DirInfo;
+  getFileInfo(FE).DirInfo = DirInfo;
+  return FE;
+}
+
+//===----------------------------------------------------------------------===//
+// File Info Management.
+//===----------------------------------------------------------------------===//
+
+
+/// getFileInfo - Return the PerFileInfo structure for the specified
+/// FileEntry.
+HeaderSearch::PerFileInfo &HeaderSearch::getFileInfo(const FileEntry *FE) {
+  if (FE->getUID() >= FileInfo.size())
+    FileInfo.resize(FE->getUID()+1);
+  return FileInfo[FE->getUID()];
+}
+
+/// ShouldEnterIncludeFile - Mark the specified file as a target of a
+/// #include, #include_next, or #import directive. Return false if #including
+/// the file will have no effect or true if we should include it.
+bool HeaderSearch::ShouldEnterIncludeFile(const FileEntry *File, bool isImport){
+  ++NumIncluded;  // Count # of attempted #includes.
+
+  // Get information about this file.
+  PerFileInfo &FileInfo = getFileInfo(File);
+
+  // If this is a #import directive, check that we have not already imported
+  // this header.
+  if (isImport) {
+    // If this has already been imported, don't import it again.
+    FileInfo.isImport = true;
+
+    // Has this already been #import'ed or #include'd?
+    if (FileInfo.NumIncludes) return false;
+  } else {
+    // Otherwise, if this is a #include of a file that was previously #import'd
+    // or if this is the second #include of a #pragma once file, ignore it.
+    if (FileInfo.isImport)
+      return false;
+  }
+
+  // Next, check to see if the file is wrapped with #ifndef guards. If so, and
+  // if the macro that guards it is defined, we know the #include has no effect.
+  if (FileInfo.ControllingMacro &&
+      FileInfo.ControllingMacro->hasMacroDefinition()) {
+    ++NumMultiIncludeFileOptzn;
+    return false;
+  }
+
+  // Increment the number of times this file has been included.
+  ++FileInfo.NumIncludes;
+
+  return true;
+}
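The controlling-macro test above is what makes re-inclusion of guarded headers essentially free. Illustration with a hypothetical header (not from the commit):

    //   // foo.h
    //   #ifndef FOO_H
    //   #define FOO_H
    //   ...
    //   #endif
    //
    // Once foo.h has been lexed, FOO_H is recorded as its controlling macro.
    // On the next #include "foo.h", FOO_H is defined, so
    // ShouldEnterIncludeFile() returns false and the file is never reopened
    // or re-lexed.
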
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
new file mode 100644
index 00000000000..98bbb386305
--- /dev/null
+++ b/clang/lib/Lex/Lexer.cpp
@@ -0,0 +1,1661 @@
+//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Lexer and Token interfaces.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: GCC Diagnostics emitted by the lexer:
+// PEDWARN: (form feed|vertical tab) in preprocessing directive
+//
+// Universal characters, unicode, char mapping:
+// WARNING: `%.*s' is not in NFKC
+// WARNING: `%.*s' is not in NFC
+//
+// Other:
+// TODO: Options to support:
+//    -fexec-charset,-fwide-exec-charset
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/Lexer.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/SourceManager.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cctype>
+using namespace clang;
+
+static void InitCharacterInfo();
+
+//===----------------------------------------------------------------------===//
+// Token Class Implementation
+//===----------------------------------------------------------------------===//
+
+/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
+bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
+  return is(tok::identifier) &&
+         getIdentifierInfo()->getObjCKeywordID() == objcKey;
+}
+
+/// getObjCKeywordID - Return the ObjC keyword kind.
+tok::ObjCKeywordKind Token::getObjCKeywordID() const {
+  IdentifierInfo *specId = getIdentifierInfo();
+  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
+}
+
+/// isNamedIdentifier - Return true if this token is a ppidentifier with the
+/// specified name. For example, tok.isNamedIdentifier("this").
+bool Token::isNamedIdentifier(const char *Name) const {
+  return IdentInfo && !strcmp(IdentInfo->getName(), Name);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Lexer Class Implementation
+//===----------------------------------------------------------------------===//
+
+
+/// Lexer constructor - Create a new lexer object for the specified buffer
+/// with the specified preprocessor managing the lexing process. This lexer
+/// assumes that the associated file buffer and Preprocessor objects will
+/// outlive it, so it doesn't take ownership of either of them.
+Lexer::Lexer(SourceLocation fileloc, Preprocessor &pp,
+             const char *BufStart, const char *BufEnd)
+  : FileLoc(fileloc), PP(&pp), Features(pp.getLangOptions()) {
+
+  SourceManager &SourceMgr = PP->getSourceManager();
+  unsigned InputFileID = SourceMgr.getPhysicalLoc(FileLoc).getFileID();
+  const llvm::MemoryBuffer *InputFile = SourceMgr.getBuffer(InputFileID);
+
+  Is_PragmaLexer = false;
+  InitCharacterInfo();
+
+  // BufferStart must always be InputFile->getBufferStart().
+  BufferStart = InputFile->getBufferStart();
+
+  // BufferPtr and BufferEnd can start out somewhere inside the current buffer.
+  // If unspecified, they start at the start/end of the buffer.
+  BufferPtr = BufStart ? BufStart : BufferStart;
+  BufferEnd = BufEnd ? BufEnd : InputFile->getBufferEnd();
+
+  assert(BufferEnd[0] == 0 &&
+         "We assume that the input buffer has a null character at the end"
+         " to simplify lexing!");
+
+  // Start of the file is a start of line.
+  IsAtStartOfLine = true;
+
+  // We are not after parsing a #.
+  ParsingPreprocessorDirective = false;
+
+  // We are not after parsing #include.
+  ParsingFilename = false;
+
+  // We are not in raw mode. Raw mode disables diagnostics and interpretation
+  // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
+  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
+  // or otherwise skipping over tokens.
+  LexingRawMode = false;
+
+  // Default to keeping comments if requested.
+  KeepCommentMode = PP->getCommentRetentionState();
+}
+
+/// Lexer constructor - Create a new raw lexer object. This object is only
+/// suitable for calls to 'LexRawToken'. This lexer assumes that the
+/// associated file buffer will outlive it, so it doesn't take ownership
+/// of it.
+Lexer::Lexer(SourceLocation fileloc, const LangOptions &features,
+             const char *BufStart, const char *BufEnd)
+  : FileLoc(fileloc), PP(0), Features(features) {
+  Is_PragmaLexer = false;
+  InitCharacterInfo();
+
+  BufferStart = BufStart;
+  BufferPtr = BufStart;
+  BufferEnd = BufEnd;
+
+  assert(BufferEnd[0] == 0 &&
+         "We assume that the input buffer has a null character at the end"
+         " to simplify lexing!");
+
+  // Start of the file is a start of line.
+  IsAtStartOfLine = true;
+
+  // We are not after parsing a #.
+  ParsingPreprocessorDirective = false;
+
+  // We are not after parsing #include.
+  ParsingFilename = false;
+
+  // We *are* in raw mode.
+  LexingRawMode = true;
+
+  // Never keep comments in raw mode.
+  KeepCommentMode = false;
+}
+
+
+/// Stringify - Convert the specified string into a C string, with surrounding
+/// ""'s, and with escaped \ and " characters.
+std::string Lexer::Stringify(const std::string &Str, bool Charify) {
+  std::string Result = Str;
+  char Quote = Charify ? '\'' : '"';
+  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
+    if (Result[i] == '\\' || Result[i] == Quote) {
+      Result.insert(Result.begin()+i, '\\');
+      ++i; ++e;
+    }
+  }
+  return Result;
+}
+
+/// Stringify - Convert the specified string into a C string by escaping '\'
+/// and " characters. This does not add surrounding ""'s to the string.
+void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) {
+  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+    if (Str[i] == '\\' || Str[i] == '"') {
+      Str.insert(Str.begin()+i, '\\');
+      ++i; ++e;
+    }
+  }
+}
+
+
+/// MeasureTokenLength - Relex the token at the specified location and return
+/// its length in bytes in the input file. If the token needs cleaning (e.g.
+/// includes a trigraph or an escaped newline) then this count includes bytes
+/// that are part of that.
+unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
+                                   const SourceManager &SM) {
+  // If this comes from a macro expansion, we really do want the macro name, not
+  // the token this macro expanded to.
+  Loc = SM.getLogicalLoc(Loc);
+
+  const char *StrData = SM.getCharacterData(Loc);
+
+  // TODO: this could be special cased for common tokens like identifiers, ')',
+  // etc to make this faster, if it mattered. Just look at StrData[0] to handle
+  // all obviously single-char tokens. This could use
+  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
+  // something.
+
+
+  const char *BufEnd = SM.getBufferData(Loc.getFileID()).second;
+
+  // Create a langops struct and enable trigraphs. This is sufficient for
+  // measuring tokens.
+  LangOptions LangOpts;
+  LangOpts.Trigraphs = true;
+
+  // Create a lexer starting at the beginning of this token.
+  Lexer TheLexer(Loc, LangOpts, StrData, BufEnd);
+  Token TheTok;
+  TheLexer.LexRawToken(TheTok);
+  return TheTok.getLength();
+}
+
+//===----------------------------------------------------------------------===//
+// Character information.
+//===----------------------------------------------------------------------===//
+
+static unsigned char CharInfo[256];
+
+enum {
+  CHAR_HORZ_WS  = 0x01,  // ' ', '\t', '\f', '\v'. Note, no '\0'
+  CHAR_VERT_WS  = 0x02,  // '\r', '\n'
+  CHAR_LETTER   = 0x04,  // a-z,A-Z
+  CHAR_NUMBER   = 0x08,  // 0-9
+  CHAR_UNDER    = 0x10,  // _
+  CHAR_PERIOD   = 0x20   // .
+};
+
+static void InitCharacterInfo() {
+  static bool isInited = false;
+  if (isInited) return;
+  isInited = true;
+
+  // Initialize the CharInfo table.
+  // TODO: statically initialize this.
+  CharInfo[(int)' '] = CharInfo[(int)'\t'] =
+  CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS;
+  CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS;
+
+  CharInfo[(int)'_'] = CHAR_UNDER;
+  CharInfo[(int)'.'] = CHAR_PERIOD;
+  for (unsigned i = 'a'; i <= 'z'; ++i)
+    CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER;
+  for (unsigned i = '0'; i <= '9'; ++i)
+    CharInfo[i] = CHAR_NUMBER;
+}
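Given the table initialization above, the classification helpers defined next behave like so (worked examples, for illustration):

    //   isIdentifierBody('a') == true   // CHAR_LETTER
    //   isIdentifierBody('7') == true   // CHAR_NUMBER
    //   isIdentifierBody('_') == true   // CHAR_UNDER
    //   isIdentifierBody('$') == false  // '$' handled separately in
    //                                   // LexIdentifier (DollarIdents)
    //   isNumberBody('.')     == true   // pp-numbers may contain '.'
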
+
+/// isIdentifierBody - Return true if this is the body character of an
+/// identifier, which is [a-zA-Z0-9_].
+static inline bool isIdentifierBody(unsigned char c) {
+  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
+}
+
+/// isHorizontalWhitespace - Return true if this character is horizontal
+/// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'.
+static inline bool isHorizontalWhitespace(unsigned char c) {
+  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
+}
+
+/// isWhitespace - Return true if this character is horizontal or vertical
+/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false
+/// for '\0'.
+static inline bool isWhitespace(unsigned char c) {
+  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
+}
+
+/// isNumberBody - Return true if this is the body character of a
+/// preprocessing number, which is [a-zA-Z0-9_.].
+static inline bool isNumberBody(unsigned char c) {
+  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
+    true : false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Diagnostics forwarding code.
+//===----------------------------------------------------------------------===//
+
+/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
+/// lexer buffer was all instantiated at a single point, perform the mapping.
+/// This is currently only used for _Pragma implementation, so it is the slow
+/// path of the hot getSourceLocation method. Do not allow it to be inlined.
+static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
+                                        SourceLocation FileLoc,
+                                        unsigned CharNo) DISABLE_INLINE;
+static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
+                                        SourceLocation FileLoc,
+                                        unsigned CharNo) {
+  // Otherwise, we're lexing "mapped tokens". This is used for things like
+  // _Pragma handling. Combine the instantiation location of FileLoc with the
+  // physical location.
+  SourceManager &SourceMgr = PP.getSourceManager();
+
+  // Create a new SLoc which is expanded from logical(FileLoc) but whose
+  // characters come from phys(FileLoc)+Offset.
+  SourceLocation VirtLoc = SourceMgr.getLogicalLoc(FileLoc);
+  SourceLocation PhysLoc = SourceMgr.getPhysicalLoc(FileLoc);
+  PhysLoc = SourceLocation::getFileLoc(PhysLoc.getFileID(), CharNo);
+  return SourceMgr.getInstantiationLoc(PhysLoc, VirtLoc);
+}
+
+/// getSourceLocation - Return a source location identifier for the specified
+/// offset in the current file.
+SourceLocation Lexer::getSourceLocation(const char *Loc) const {
+  assert(Loc >= BufferStart && Loc <= BufferEnd &&
+         "Location out of range for this buffer!");
+
+  // In the normal case, we're just lexing from a simple file buffer, return
+  // the file id from FileLoc with the offset specified.
+  unsigned CharNo = Loc-BufferStart;
+  if (FileLoc.isFileID())
+    return SourceLocation::getFileLoc(FileLoc.getFileID(), CharNo);
+
+  assert(PP && "This doesn't work on raw lexers");
+  return GetMappedTokenLoc(*PP, FileLoc, CharNo);
+}
+
+/// Diag - Forwarding function for diagnostics. This translates a source
+/// position in the current buffer into a SourceLocation object for rendering.
+void Lexer::Diag(const char *Loc, unsigned DiagID,
+                 const std::string &Msg) const {
+  if (LexingRawMode && Diagnostic::isBuiltinNoteWarningOrExtension(DiagID))
+    return;
+  PP->Diag(getSourceLocation(Loc), DiagID, Msg);
+}
+void Lexer::Diag(SourceLocation Loc, unsigned DiagID,
+                 const std::string &Msg) const {
+  if (LexingRawMode && Diagnostic::isBuiltinNoteWarningOrExtension(DiagID))
+    return;
+  PP->Diag(Loc, DiagID, Msg);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Trigraph and Escaped Newline Handling Code.
+//===----------------------------------------------------------------------===//
+
+/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
+/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
+static char GetTrigraphCharForLetter(char Letter) {
+  switch (Letter) {
+  default:   return 0;
+  case '=':  return '#';
+  case ')':  return ']';
+  case '(':  return '[';
+  case '!':  return '|';
+  case '\'': return '^';
+  case '>':  return '}';
+  case '/':  return '\\';
+  case '<':  return '{';
+  case '-':  return '~';
+  }
+}
+
+/// DecodeTrigraphChar - If the specified character is a legal trigraph when
+/// prefixed with ??, emit a warning about its use (whether trigraphs are
+/// enabled or not) and return the decoded character if trigraphs are enabled,
+/// or 0 if they are not.
+static char DecodeTrigraphChar(const char *CP, Lexer *L) {
+  char Res = GetTrigraphCharForLetter(*CP);
+  if (Res && L) {
+    if (!L->getFeatures().Trigraphs) {
+      L->Diag(CP-2, diag::trigraph_ignored);
+      return 0;
+    } else {
+      L->Diag(CP-2, diag::trigraph_converted, std::string()+Res);
+    }
+  }
+  return Res;
+}
+
+/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
+/// get its size, and return it. This is tricky in several cases:
+///   1. If currently at the start of a trigraph, we warn about the trigraph,
+///      then either return the trigraph (skipping 3 chars) or the '?',
+///      depending on whether trigraphs are enabled or not.
+///   2. If this is an escaped newline (potentially with whitespace between
+///      the backslash and newline), implicitly skip the newline and return
+///      the char after it.
+///   3. If this is a UCN, return it. FIXME: C++ UCN's?
+///
+/// This handles the slow/uncommon case of the getCharAndSize method. Here we
+/// know that we can accumulate into Size, and that we have already incremented
+/// Ptr by Size bytes.
+///
+/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
+/// be updated to match.
+///
+char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
+                               Token *Tok) {
+  // If we have a slash, look for an escaped newline.
+  if (Ptr[0] == '\\') {
+    ++Size;
+    ++Ptr;
+Slash:
+    // Common case, backslash-char where the char is not whitespace.
+    if (!isWhitespace(Ptr[0])) return '\\';
+
+    // See if we have optional whitespace characters followed by a newline.
+    {
+      unsigned SizeTmp = 0;
+      do {
+        ++SizeTmp;
+        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
+          // Remember that this token needs to be cleaned.
+          if (Tok) Tok->setFlag(Token::NeedsCleaning);
+
+          // Warn if there was whitespace between the backslash and newline.
+          if (SizeTmp != 1 && Tok)
+            Diag(Ptr, diag::backslash_newline_space);
+
+          // If this is a \r\n or \n\r, skip the newlines.
+          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
+              Ptr[SizeTmp-1] != Ptr[SizeTmp])
+            ++SizeTmp;
+
+          // Found backslash<whitespace><newline>. Parse the char after it.
+          Size += SizeTmp;
+          Ptr  += SizeTmp;
+          // Use slow version to accumulate a correct size field.
+          return getCharAndSizeSlow(Ptr, Size, Tok);
+        }
+      } while (isWhitespace(Ptr[SizeTmp]));
+    }
+
+    // Otherwise, this is not an escaped newline, just return the slash.
+    return '\\';
+  }
+
+  // If this is a trigraph, process it.
+  if (Ptr[0] == '?' && Ptr[1] == '?') {
+    // If this is actually a legal trigraph (not something like "??x"), emit
+    // a trigraph warning. If so, and if trigraphs are enabled, return it.
+    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
+      // Remember that this token needs to be cleaned.
+      if (Tok) Tok->setFlag(Token::NeedsCleaning);
+
+      Ptr += 3;
+      Size += 3;
+      if (C == '\\') goto Slash;
+      return C;
+    }
+  }
+
+  // If this is neither, return a single character.
+  ++Size;
+  return *Ptr;
+}
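A worked trace of the slow path above on the five-byte input "??/" followed by a newline and "]" (illustration):

    //   Ptr: '?' '?' '/' '\n' ']'
    //   1. "??/" is a legal trigraph for '\\' -> Size += 3, goto Slash
    //   2. '\\' is followed by a newline      -> escaped newline, Size += 1
    //   3. the recursive call reads ']'       -> Size += 1, returns ']'
    // Net effect: getCharAndSizeSlow returns ']' with Size == 5, and the
    // token is flagged Token::NeedsCleaning so its spelling is re-derived.
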
+
+
+/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
+/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
+/// and that we have already incremented Ptr by Size bytes.
+///
+/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
+/// be updated to match.
+char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
+                                     const LangOptions &Features) {
+  // If we have a slash, look for an escaped newline.
+  if (Ptr[0] == '\\') {
+    ++Size;
+    ++Ptr;
+Slash:
+    // Common case, backslash-char where the char is not whitespace.
+    if (!isWhitespace(Ptr[0])) return '\\';
+
+    // See if we have optional whitespace characters followed by a newline.
+    {
+      unsigned SizeTmp = 0;
+      do {
+        ++SizeTmp;
+        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
+
+          // If this is a \r\n or \n\r, skip the newlines.
+          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
+              Ptr[SizeTmp-1] != Ptr[SizeTmp])
+            ++SizeTmp;
+
+          // Found backslash<whitespace><newline>. Parse the char after it.
+          Size += SizeTmp;
+          Ptr  += SizeTmp;
+
+          // Use slow version to accumulate a correct size field.
+          return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
+        }
+      } while (isWhitespace(Ptr[SizeTmp]));
+    }
+
+    // Otherwise, this is not an escaped newline, just return the slash.
+    return '\\';
+  }
+
+  // If this is a trigraph, process it.
+  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
+    // If this is actually a legal trigraph (not something like "??x"), return
+    // it.
+    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
+      Ptr += 3;
+      Size += 3;
+      if (C == '\\') goto Slash;
+      return C;
+    }
+  }
+
+  // If this is neither, return a single character.
+  ++Size;
+  return *Ptr;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper methods for lexing.
+//===----------------------------------------------------------------------===//
+
+void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
+  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
+  unsigned Size;
+  unsigned char C = *CurPtr++;
+  while (isIdentifierBody(C)) {
+    C = *CurPtr++;
+  }
+  --CurPtr;  // Back up over the skipped character.
+
+  // Fast path, no $,\,? in identifier found. '\' might be an escaped newline
+  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
+  // FIXME: UCNs.
+  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
+FinishIdentifier:
+    const char *IdStart = BufferPtr;
+    FormTokenWithChars(Result, CurPtr);
+    Result.setKind(tok::identifier);
+
+    // If we are in raw mode, return this identifier raw. There is no need to
+    // look up identifier information or attempt to macro expand it.
+    if (LexingRawMode) return;
+
+    // Fill in Result.IdentifierInfo, looking up the identifier in the
+    // identifier table.
+    PP->LookUpIdentifierInfo(Result, IdStart);
+
+    // Finally, now that we know we have an identifier, pass this off to the
+    // preprocessor, which may macro expand it or something.
+    return PP->HandleIdentifier(Result);
+  }
+
+  // Otherwise, $,\,? in identifier found. Enter slower path.
+
+  C = getCharAndSize(CurPtr, Size);
+  while (1) {
+    if (C == '$') {
+      // If we hit a $ and they are not supported in identifiers, we are done.
+      if (!Features.DollarIdents) goto FinishIdentifier;
+
+      // Otherwise, emit a diagnostic and continue.
+      Diag(CurPtr, diag::ext_dollar_in_identifier);
+      CurPtr = ConsumeChar(CurPtr, Size, Result);
+      C = getCharAndSize(CurPtr, Size);
+      continue;
+    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
+      // Found end of identifier.
+      goto FinishIdentifier;
+    }
+
+    // Otherwise, this character is good, consume it.
+    CurPtr = ConsumeChar(CurPtr, Size, Result);
+
+    C = getCharAndSize(CurPtr, Size);
+    while (isIdentifierBody(C)) { // FIXME: UCNs.
+      CurPtr = ConsumeChar(CurPtr, Size, Result);
+      C = getCharAndSize(CurPtr, Size);
+    }
+  }
+}
+
+
+/// LexNumericConstant - Lex the remainder of an integer or floating point
+/// constant. From[-1] is the first character lexed. Return the end of the
+/// constant.
+void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
+  unsigned Size;
+  char C = getCharAndSize(CurPtr, Size);
+  char PrevCh = 0;
+  while (isNumberBody(C)) { // FIXME: UCNs?
+    CurPtr = ConsumeChar(CurPtr, Size, Result);
+    PrevCh = C;
+    C = getCharAndSize(CurPtr, Size);
+  }
+
+  // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
+  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e'))
+    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
+
+  // If we have a hex FP constant, continue.
+  if (Features.HexFloats &&
+      (C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p'))
+    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
+
+  Result.setKind(tok::numeric_constant);
+
+  // Update the location of token as well as BufferPtr.
+  FormTokenWithChars(Result, CurPtr);
+}
+
+/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
+/// either " or L".
+void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide){
+  const char *NulCharacter = 0; // Does this string contain the \0 character?
+
+  char C = getAndAdvanceChar(CurPtr, Result);
+  while (C != '"') {
+    // Skip escaped characters.
+    if (C == '\\') {
+      // Skip the escaped character.
+      C = getAndAdvanceChar(CurPtr, Result);
+    } else if (C == '\n' || C == '\r' ||                // Newline.
+               (C == 0 && CurPtr-1 == BufferEnd)) {     // End of file.
+      if (!LexingRawMode) Diag(BufferPtr, diag::err_unterminated_string);
+      Result.setKind(tok::unknown);
+      FormTokenWithChars(Result, CurPtr-1);
+      return;
+    } else if (C == 0) {
+      NulCharacter = CurPtr-1;
+    }
+    C = getAndAdvanceChar(CurPtr, Result);
+  }
+
+  // If a nul character existed in the string, warn about it.
+  if (NulCharacter) Diag(NulCharacter, diag::null_in_string);
+
+  Result.setKind(Wide ? tok::wide_string_literal : tok::string_literal);
+
+  // Update the location of the token as well as the BufferPtr instance var.
+  FormTokenWithChars(Result, CurPtr);
+}
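Worked examples of the exponent-sign rule in LexNumericConstant above (illustration):

    //   "1e+12"  -> '+' follows 'e', so the scan continues:
    //               the whole "1e+12" is one numeric constant.
    //   "1+12"   -> '+' follows '1', so lexing stops:
    //               three tokens, "1" "+" "12".
    //   "0x1p+3" -> with HexFloats enabled, '+' after 'p' continues
    //               the constant the same way.
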
+
+/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
+/// after having lexed the '<' character. This is used for #include filenames.
+void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
+  const char *NulCharacter = 0; // Does this string contain the \0 character?
+
+  char C = getAndAdvanceChar(CurPtr, Result);
+  while (C != '>') {
+    // Skip escaped characters.
+    if (C == '\\') {
+      // Skip the escaped character.
+      C = getAndAdvanceChar(CurPtr, Result);
+    } else if (C == '\n' || C == '\r' ||                // Newline.
+               (C == 0 && CurPtr-1 == BufferEnd)) {     // End of file.
+      if (!LexingRawMode) Diag(BufferPtr, diag::err_unterminated_string);
+      Result.setKind(tok::unknown);
+      FormTokenWithChars(Result, CurPtr-1);
+      return;
+    } else if (C == 0) {
+      NulCharacter = CurPtr-1;
+    }
+    C = getAndAdvanceChar(CurPtr, Result);
+  }
+
+  // If a nul character existed in the string, warn about it.
+  if (NulCharacter) Diag(NulCharacter, diag::null_in_string);
+
+  Result.setKind(tok::angle_string_literal);
+
+  // Update the location of token as well as BufferPtr.
+  FormTokenWithChars(Result, CurPtr);
+}
+
+
+/// LexCharConstant - Lex the remainder of a character constant, after having
+/// lexed either ' or L'.
+void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
+  const char *NulCharacter = 0; // Does this character contain the \0 character?
+
+  // Handle the common case of 'x' and '\y' efficiently.
+  char C = getAndAdvanceChar(CurPtr, Result);
+  if (C == '\'') {
+    if (!LexingRawMode) Diag(BufferPtr, diag::err_empty_character);
+    Result.setKind(tok::unknown);
+    FormTokenWithChars(Result, CurPtr);
+    return;
+  } else if (C == '\\') {
+    // Skip the escaped character.
+    // FIXME: UCN's.
+    C = getAndAdvanceChar(CurPtr, Result);
+  }
+
+  if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') {
+    ++CurPtr;
+  } else {
+    // Fall back on generic code for embedded nulls, newlines, wide chars.
+    do {
+      // Skip escaped characters.
+      if (C == '\\') {
+        // Skip the escaped character.
+        C = getAndAdvanceChar(CurPtr, Result);
+      } else if (C == '\n' || C == '\r' ||              // Newline.
+                 (C == 0 && CurPtr-1 == BufferEnd)) {   // End of file.
+        if (!LexingRawMode) Diag(BufferPtr, diag::err_unterminated_char);
+        Result.setKind(tok::unknown);
+        FormTokenWithChars(Result, CurPtr-1);
+        return;
+      } else if (C == 0) {
+        NulCharacter = CurPtr-1;
+      }
+      C = getAndAdvanceChar(CurPtr, Result);
+    } while (C != '\'');
+  }
+
+  if (NulCharacter) Diag(NulCharacter, diag::null_in_char);
+
+  Result.setKind(tok::char_constant);
+
+  // Update the location of token as well as BufferPtr.
+  FormTokenWithChars(Result, CurPtr);
+}
+
+/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
+/// Update BufferPtr to point to the next non-whitespace character and return.
+void Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
+  // Whitespace - Skip it, then return the token after the whitespace.
+  unsigned char Char = *CurPtr;  // Skip consecutive spaces efficiently.
+  while (1) {
+    // Skip horizontal whitespace very aggressively.
+    while (isHorizontalWhitespace(Char))
+      Char = *++CurPtr;
+
+    // Otherwise if we see something other than whitespace, we're done.
+    if (Char != '\n' && Char != '\r')
+      break;
+
+    if (ParsingPreprocessorDirective) {
+      // End of preprocessor directive line, let LexTokenInternal handle this.
+      BufferPtr = CurPtr;
+      return;
+    }
+
+    // OK, but handle newline.
+    // The returned token is at the start of the line.
+    Result.setFlag(Token::StartOfLine);
+    // No leading whitespace seen so far.
+    Result.clearFlag(Token::LeadingSpace);
+    Char = *++CurPtr;
+  }
+
+  // If this isn't immediately after a newline, there is leading space.
+  char PrevChar = CurPtr[-1];
+  if (PrevChar != '\n' && PrevChar != '\r')
+    Result.setFlag(Token::LeadingSpace);
+
+  BufferPtr = CurPtr;
+}
+
+/// SkipBCPLComment - We have just read the // characters from input. Skip
+/// until we find the newline character that terminates the comment. Then
+/// update BufferPtr and return.
+bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
+  // If BCPL comments aren't explicitly enabled for this language, emit an
+  // extension warning.
+  if (!Features.BCPLComment) {
+    Diag(BufferPtr, diag::ext_bcpl_comment);
+
+    // Mark them enabled so we only emit one warning for this translation
+    // unit.
+    Features.BCPLComment = true;
+  }
+
+  // Scan over the body of the comment. The common case, when scanning, is that
+  // the comment contains normal ascii characters with nothing interesting in
+  // them. As such, optimize for this case with the inner loop.
+  char C;
+  do {
+    C = *CurPtr;
+    // FIXME: Speedup BCPL comment lexing. Just scan for a \n or \r character.
+    // If we find a \n character, scan backwards, checking to see if it's an
+    // escaped newline, like we do for block comments.
+
+    // Skip over characters in the fast loop.
+    while (C != 0 &&                // Potentially EOF.
+           C != '\\' &&             // Potentially escaped newline.
+           C != '?' &&              // Potentially trigraph.
+           C != '\n' && C != '\r')  // Newline or DOS-style newline.
+      C = *++CurPtr;
+
+    // If this is a newline, we're done.
+    if (C == '\n' || C == '\r')
+      break;  // Found the newline? Break out!
+
+    // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
+    // properly decode the character.
+    const char *OldPtr = CurPtr;
+    C = getAndAdvanceChar(CurPtr, Result);
+
+    // If we read multiple characters, and one of those characters was a \r or
+    // \n, then we had an escaped newline within the comment. Emit diagnostic
+    // unless the next line is also a // comment.
+    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
+      for (; OldPtr != CurPtr; ++OldPtr)
+        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
+          // Okay, we found a // comment that ends in a newline, if the next
+          // line is also a // comment, but has spaces, don't emit a diagnostic.
+          if (isspace(C)) {
+            const char *ForwardPtr = CurPtr;
+            while (isspace(*ForwardPtr))  // Skip whitespace.
+              ++ForwardPtr;
+            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
+              break;
+          }
+
+          Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
+          break;
+        }
+    }
+
+    if (CurPtr == BufferEnd+1) { --CurPtr; break; }
+  } while (C != '\n' && C != '\r');
+
+  // Found but did not consume the newline.
+
+  // If we are returning comments as tokens, return this comment as a token.
+  if (KeepCommentMode)
+    return SaveBCPLComment(Result, CurPtr);
+
+  // If we are inside a preprocessor directive and we see the end of line,
+  // return immediately, so that the lexer can return this as an EOM token.
+  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
+    BufferPtr = CurPtr;
+    return true;
+  }
+
+  // Otherwise, eat the \n character. We don't care if this is a \n\r or
+  // \r\n sequence.
+  ++CurPtr;
+
+  // The next returned token is at the start of the line.
+  Result.setFlag(Token::StartOfLine);
+  // No leading whitespace seen so far.
+  Result.clearFlag(Token::LeadingSpace);
+  BufferPtr = CurPtr;
+  return true;
+}
+
+/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
+/// an appropriate way and return it.
+bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
+  Result.setKind(tok::comment);
+  FormTokenWithChars(Result, CurPtr);
+
+  // If this BCPL-style comment is in a macro definition, transmogrify it into
+  // a C-style block comment.
+  if (ParsingPreprocessorDirective) {
+    std::string Spelling = PP->getSpelling(Result);
+    assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
+    Spelling[1] = '*';   // Change prefix to "/*".
+    Spelling += "*/";    // add suffix.
+
+    Result.setLocation(PP->CreateString(&Spelling[0], Spelling.size(),
+                                        Result.getLocation()));
+    Result.setLength(Spelling.size());
+  }
+  return false;
+}
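The respelling above turns a line comment into a block comment so it survives inside a macro body. Worked example (illustration):

    //   #define M(x) x   // add one
    // The stored spelling "// add one" becomes "/* add one*/": the second
    // '/' is overwritten with '*', then "*/" is appended.
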
+
+/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified
+/// newline character (either \n or \r) is part of an escaped newline sequence.
+/// Issue a diagnostic if so. We know that this is inside of a block comment.
+static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
+                                                  Lexer *L) {
+  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
+
+  // Back up off the newline.
+  --CurPtr;
+
+  // If this is a two-character newline sequence, skip the other character.
+  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
+    // \n\n or \r\r -> not escaped newline.
+    if (CurPtr[0] == CurPtr[1])
+      return false;
+    // \n\r or \r\n -> skip the newline.
+    --CurPtr;
+  }
+
+  // If we have horizontal whitespace, skip over it. We allow whitespace
+  // between the slash and newline.
+  bool HasSpace = false;
+  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
+    --CurPtr;
+    HasSpace = true;
+  }
+
+  // If we have a slash, we know this is an escaped newline.
+  if (*CurPtr == '\\') {
+    if (CurPtr[-1] != '*') return false;
+  } else {
+    // It isn't a slash, is it the ?? / trigraph?
+    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
+        CurPtr[-3] != '*')
+      return false;
+
+    // This is the trigraph ending the comment. Emit a stern warning!
+    CurPtr -= 2;
+
+    // If no trigraphs are enabled, warn that we ignored this trigraph and
+    // ignore this * character.
+    if (!L->getFeatures().Trigraphs) {
+      L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
+      return false;
+    }
+    L->Diag(CurPtr, diag::trigraph_ends_block_comment);
+  }
+
+  // Warn about having an escaped newline between the */ characters.
+  L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
+
+  // If there was space between the backslash and newline, warn about it.
+  if (HasSpace) L->Diag(CurPtr, diag::backslash_newline_space);
+
+  return true;
+}
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#elif __ALTIVEC__
+#include <altivec.h>
+#undef bool
+#endif
+
+/// SkipBlockComment - We have just read the /* characters from input. Read
+/// until we find the */ characters that terminate the comment. Note that we
+/// don't bother decoding trigraphs or escaped newlines in block comments,
+/// because they cannot cause the comment to end. The only thing that can
+/// happen is the comment could end with an escaped newline between the * and
+/// / of the comment terminator.
+bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
+  // Scan one character past where we should, looking for a '/' character. Once
+  // we find it, check to see if it was preceded by a *. This common
+  // optimization helps people who like to put a lot of * characters in their
+  // comments.
+
+  // The first character we get with newlines and trigraphs skipped to handle
+  // the degenerate /*/ case below correctly if the * has an escaped newline
+  // after it.
+  unsigned CharSize;
+  unsigned char C = getCharAndSize(CurPtr, CharSize);
+  CurPtr += CharSize;
+  if (C == 0 && CurPtr == BufferEnd+1) {
+    Diag(BufferPtr, diag::err_unterminated_block_comment);
+    BufferPtr = CurPtr-1;
+    return true;
+  }
+
+  // Check to see if the first character after the '/*' is another /. If so,
+  // then this slash does not end the block comment, it is part of it.
+ if (C == '/') + C = *CurPtr++; + + while (1) { + // Skip over all non-interesting characters until we find end of buffer or a + // (probably ending) '/' character. + if (CurPtr + 24 < BufferEnd) { + // While not aligned to a 16-byte boundary. + while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) + C = *CurPtr++; + + if (C == '/') goto FoundSlash; + +#ifdef __SSE2__ + __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/', + '/', '/', '/', '/', '/', '/', '/', '/'); + while (CurPtr+16 <= BufferEnd && + _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0) + CurPtr += 16; +#elif __ALTIVEC__ + __vector unsigned char Slashes = { + '/', '/', '/', '/', '/', '/', '/', '/', + '/', '/', '/', '/', '/', '/', '/', '/' + }; + while (CurPtr+16 <= BufferEnd && + !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes)) + CurPtr += 16; +#else + // Scan for '/' quickly. Many block comments are very large. + while (CurPtr[0] != '/' && + CurPtr[1] != '/' && + CurPtr[2] != '/' && + CurPtr[3] != '/' && + CurPtr+4 < BufferEnd) { + CurPtr += 4; + } +#endif + + // It has to be one of the bytes scanned, increment to it and read one. + C = *CurPtr++; + } + + // Loop to scan the remainder. + while (C != '/' && C != '\0') + C = *CurPtr++; + + FoundSlash: + if (C == '/') { + if (CurPtr[-2] == '*') // We found the final */. We're done! + break; + + if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { + if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { + // We found the final */, though it had an escaped newline between the + // * and /. We're done! + break; + } + } + if (CurPtr[0] == '*' && CurPtr[1] != '/') { + // If this is a /* inside of the comment, emit a warning. Don't do this + // if this is a /*/, which will end the comment. This misses cases with + // embedded escaped newlines, but oh well. + Diag(CurPtr-1, diag::nested_block_comment); + } + } else if (C == 0 && CurPtr == BufferEnd+1) { + Diag(BufferPtr, diag::err_unterminated_block_comment); + // Note: the user probably forgot a */. We could continue immediately + // after the /*, but this would involve lexing a lot of what really is the + // comment, which surely would confuse the parser. + BufferPtr = CurPtr-1; + return true; + } + C = *CurPtr++; + } + + // If we are returning comments as tokens, return this comment as a token. + if (KeepCommentMode) { + Result.setKind(tok::comment); + FormTokenWithChars(Result, CurPtr); + return false; + } + + // It is common for the tokens immediately after a /**/ comment to be + // whitespace. Instead of going through the big switch, handle it + // efficiently now. + if (isHorizontalWhitespace(*CurPtr)) { + Result.setFlag(Token::LeadingSpace); + SkipWhitespace(Result, CurPtr+1); + return true; + } + + // Otherwise, just return so that the next character will be lexed as a token. + BufferPtr = CurPtr; + Result.setFlag(Token::LeadingSpace); + return true; +} + +//===----------------------------------------------------------------------===// +// Primary Lexing Entry Points +//===----------------------------------------------------------------------===// + +/// LexIncludeFilename - After the preprocessor has parsed a #include, lex and +/// (potentially) macro expand the filename. +void Lexer::LexIncludeFilename(Token &FilenameTok) { + assert(ParsingPreprocessorDirective && + ParsingFilename == false && + "Must be in a preprocessing directive!"); + + // We are now parsing a filename! + ParsingFilename = true; + + // Lex the filename. 
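+  // (While ParsingFilename is set, a '<' is lexed as a single angled string
+  // literal token; see the '<' case in LexTokenInternal.)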
+ Lex(FilenameTok); + + // We should have obtained the filename now. + ParsingFilename = false; + + // No filename? + if (FilenameTok.is(tok::eom)) + Diag(FilenameTok.getLocation(), diag::err_pp_expects_filename); +} + +/// ReadToEndOfLine - Read the rest of the current preprocessor line as an +/// uninterpreted string. This switches the lexer out of directive mode. +std::string Lexer::ReadToEndOfLine() { + assert(ParsingPreprocessorDirective && ParsingFilename == false && + "Must be in a preprocessing directive!"); + std::string Result; + Token Tmp; + + // CurPtr - Cache BufferPtr in an automatic variable. + const char *CurPtr = BufferPtr; + while (1) { + char Char = getAndAdvanceChar(CurPtr, Tmp); + switch (Char) { + default: + Result += Char; + break; + case 0: // Null. + // Found end of file? + if (CurPtr-1 != BufferEnd) { + // Nope, normal character, continue. + Result += Char; + break; + } + // FALL THROUGH. + case '\r': + case '\n': + // Okay, we found the end of the line. First, back up past the \0, \r, \n. + assert(CurPtr[-1] == Char && "Trigraphs for newline?"); + BufferPtr = CurPtr-1; + + // Next, lex the character, which should handle the EOM transition. + Lex(Tmp); + assert(Tmp.is(tok::eom) && "Unexpected token!"); + + // Finally, we're done, return the string we found. + return Result; + } + } +} + +/// LexEndOfFile - CurPtr points to the end of this file. Handle this +/// condition, reporting diagnostics and handling other edge cases as required. +/// This returns true if Result contains a token, false if PP.Lex should be +/// called again. +bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { + // If we hit the end of the file while parsing a preprocessor directive, + // end the preprocessor directive first. The next token returned will + // then be the end of file. + if (ParsingPreprocessorDirective) { + // Done parsing the "line". + ParsingPreprocessorDirective = false; + Result.setKind(tok::eom); + // Update the location of token as well as BufferPtr. + FormTokenWithChars(Result, CurPtr); + + // Restore comment saving mode, in case it was disabled for directive. + KeepCommentMode = PP->getCommentRetentionState(); + return true; // Have a token. + } + + // If we are in raw mode, return this event as an EOF token. Let the caller + // that put us in raw mode handle the event. + if (LexingRawMode) { + Result.startToken(); + BufferPtr = BufferEnd; + FormTokenWithChars(Result, BufferEnd); + Result.setKind(tok::eof); + return true; + } + + // Otherwise, issue diagnostics for unterminated #if and missing newline. + + // If we are in a #if directive, emit an error. + while (!ConditionalStack.empty()) { + Diag(ConditionalStack.back().IfLoc, diag::err_pp_unterminated_conditional); + ConditionalStack.pop_back(); + } + + // If the file was empty or didn't end in a newline, issue a pedwarn. + if (CurPtr[-1] != '\n' && CurPtr[-1] != '\r') + Diag(BufferEnd, diag::ext_no_newline_eof); + + BufferPtr = CurPtr; + + // Finally, let the preprocessor handle this. + return PP->HandleEndOfFile(Result); +} + +/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from +/// the specified lexer will return a tok::l_paren token, 0 if it is something +/// else and 2 if there are no more tokens in the buffer controlled by the +/// lexer. +unsigned Lexer::isNextPPTokenLParen() { + assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); + + // Switch to 'skipping' mode. 
This will ensure that we can lex a token
+  // without emitting diagnostics, will not expand macros, and will cause EOF
+  // to return an EOF token instead of popping the include stack.
+  LexingRawMode = true;
+
+  // Save state that can be changed while lexing so that we can restore it.
+  const char *TmpBufferPtr = BufferPtr;
+
+  Token Tok;
+  Tok.startToken();
+  LexTokenInternal(Tok);
+
+  // Restore state that may have changed.
+  BufferPtr = TmpBufferPtr;
+
+  // Restore the lexer back to non-skipping mode.
+  LexingRawMode = false;
+
+  if (Tok.is(tok::eof))
+    return 2;
+  return Tok.is(tok::l_paren);
+}
+
+
+/// LexTokenInternal - This implements a simple C family lexer.  It is an
+/// extremely performance critical piece of code.  This assumes that the
+/// buffer has a null character at the end of the file.  It returns a
+/// preprocessing token, not a normal token; as such, it is an internal
+/// interface.  It assumes that the Flags of Result have been cleared before
+/// calling this.
+void Lexer::LexTokenInternal(Token &Result) {
+LexNextToken:
+  // New token, can't need cleaning yet.
+  Result.clearFlag(Token::NeedsCleaning);
+  Result.setIdentifierInfo(0);
+
+  // CurPtr - Cache BufferPtr in an automatic variable.
+  const char *CurPtr = BufferPtr;
+
+  // Small amounts of horizontal whitespace are very common between tokens.
+  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
+    ++CurPtr;
+    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
+      ++CurPtr;
+    BufferPtr = CurPtr;
+    Result.setFlag(Token::LeadingSpace);
+  }
+
+  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
+
+  // Read a character, advancing over it.
+  char Char = getAndAdvanceChar(CurPtr, Result);
+  switch (Char) {
+  case 0:  // Null.
+    // Found end of file?
+    if (CurPtr-1 == BufferEnd) {
+      // Read the PP instance variable into an automatic variable, because
+      // LexEndOfFile will often delete 'this'.
+      Preprocessor *PPCache = PP;
+      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
+        return;   // Got a token to return.
+      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
+      return PPCache->Lex(Result);
+    }
+
+    Diag(CurPtr-1, diag::null_in_file);
+    Result.setFlag(Token::LeadingSpace);
+    SkipWhitespace(Result, CurPtr);
+    goto LexNextToken;   // GCC isn't tail call eliminating.
+  case '\n':
+  case '\r':
+    // If we are inside a preprocessor directive and we see the end of line,
+    // we know we are done with the directive, so return an EOM token.
+    if (ParsingPreprocessorDirective) {
+      // Done parsing the "line".
+      ParsingPreprocessorDirective = false;
+
+      // Restore comment saving mode, in case it was disabled for directive.
+      KeepCommentMode = PP->getCommentRetentionState();
+
+      // Since we consumed a newline, we are back at the start of a line.
+      IsAtStartOfLine = true;
+
+      Result.setKind(tok::eom);
+      break;
+    }
+    // The returned token is at the start of the line.
+    Result.setFlag(Token::StartOfLine);
+    // No leading whitespace seen so far.
+    Result.clearFlag(Token::LeadingSpace);
+    SkipWhitespace(Result, CurPtr);
+    goto LexNextToken;   // GCC isn't tail call eliminating.
+  case ' ':
+  case '\t':
+  case '\f':
+  case '\v':
+  SkipHorizontalWhitespace:
+    Result.setFlag(Token::LeadingSpace);
+    SkipWhitespace(Result, CurPtr);
+
+  SkipIgnoredUnits:
+    CurPtr = BufferPtr;
+
+    // If the next token is obviously a // or /* */ comment, skip it
+    // efficiently too (without going through the big switch stmt).
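+    // (When comments are being kept as tokens, they must take the normal
+    // path below so that they are returned to the caller.)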
+ if (CurPtr[0] == '/' && CurPtr[1] == '/' && !KeepCommentMode) { + SkipBCPLComment(Result, CurPtr+2); + goto SkipIgnoredUnits; + } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !KeepCommentMode) { + SkipBlockComment(Result, CurPtr+2); + goto SkipIgnoredUnits; + } else if (isHorizontalWhitespace(*CurPtr)) { + goto SkipHorizontalWhitespace; + } + goto LexNextToken; // GCC isn't tail call eliminating. + + // C99 6.4.4.1: Integer Constants. + // C99 6.4.4.2: Floating Constants. + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + // Notify MIOpt that we read a non-whitespace/non-comment token. + MIOpt.ReadToken(); + return LexNumericConstant(Result, CurPtr); + + case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). + // Notify MIOpt that we read a non-whitespace/non-comment token. + MIOpt.ReadToken(); + Char = getCharAndSize(CurPtr, SizeTmp); + + // Wide string literal. + if (Char == '"') + return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), + true); + + // Wide character constant. + if (Char == '\'') + return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); + // FALL THROUGH, treating L like the start of an identifier. + + // C99 6.4.2: Identifiers. + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': + case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': + case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': + case 'V': case 'W': case 'X': case 'Y': case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': + case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': + case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': + case 'v': case 'w': case 'x': case 'y': case 'z': + case '_': + // Notify MIOpt that we read a non-whitespace/non-comment token. + MIOpt.ReadToken(); + return LexIdentifier(Result, CurPtr); + + case '$': // $ in identifiers. + if (Features.DollarIdents) { + Diag(CurPtr-1, diag::ext_dollar_in_identifier); + // Notify MIOpt that we read a non-whitespace/non-comment token. + MIOpt.ReadToken(); + return LexIdentifier(Result, CurPtr); + } + + Result.setKind(tok::unknown); + break; + + // C99 6.4.4: Character Constants. + case '\'': + // Notify MIOpt that we read a non-whitespace/non-comment token. + MIOpt.ReadToken(); + return LexCharConstant(Result, CurPtr); + + // C99 6.4.5: String Literals. + case '"': + // Notify MIOpt that we read a non-whitespace/non-comment token. + MIOpt.ReadToken(); + return LexStringLiteral(Result, CurPtr, false); + + // C99 6.4.6: Punctuators. + case '?': + Result.setKind(tok::question); + break; + case '[': + Result.setKind(tok::l_square); + break; + case ']': + Result.setKind(tok::r_square); + break; + case '(': + Result.setKind(tok::l_paren); + break; + case ')': + Result.setKind(tok::r_paren); + break; + case '{': + Result.setKind(tok::l_brace); + break; + case '}': + Result.setKind(tok::r_brace); + break; + case '.': + Char = getCharAndSize(CurPtr, SizeTmp); + if (Char >= '0' && Char <= '9') { + // Notify MIOpt that we read a non-whitespace/non-comment token. + MIOpt.ReadToken(); + + return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); + } else if (Features.CPlusPlus && Char == '*') { + Result.setKind(tok::periodstar); + CurPtr += SizeTmp; + } else if (Char == '.' 
&& + getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { + Result.setKind(tok::ellipsis); + CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result); + } else { + Result.setKind(tok::period); + } + break; + case '&': + Char = getCharAndSize(CurPtr, SizeTmp); + if (Char == '&') { + Result.setKind(tok::ampamp); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Char == '=') { + Result.setKind(tok::ampequal); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::amp); + } + break; + case '*': + if (getCharAndSize(CurPtr, SizeTmp) == '=') { + Result.setKind(tok::starequal); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::star); + } + break; + case '+': + Char = getCharAndSize(CurPtr, SizeTmp); + if (Char == '+') { + Result.setKind(tok::plusplus); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Char == '=') { + Result.setKind(tok::plusequal); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::plus); + } + break; + case '-': + Char = getCharAndSize(CurPtr, SizeTmp); + if (Char == '-') { + Result.setKind(tok::minusminus); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Char == '>' && Features.CPlusPlus && + getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { + Result.setKind(tok::arrowstar); // C++ ->* + CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result); + } else if (Char == '>') { + Result.setKind(tok::arrow); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Char == '=') { + Result.setKind(tok::minusequal); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::minus); + } + break; + case '~': + Result.setKind(tok::tilde); + break; + case '!': + if (getCharAndSize(CurPtr, SizeTmp) == '=') { + Result.setKind(tok::exclaimequal); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::exclaim); + } + break; + case '/': + // 6.4.9: Comments + Char = getCharAndSize(CurPtr, SizeTmp); + if (Char == '/') { // BCPL comment. + if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) { + // It is common for the tokens immediately after a // comment to be + // whitespace (indentation for the next line). Instead of going through + // the big switch, handle it efficiently now. + goto SkipIgnoredUnits; + } + return; // KeepCommentMode + } else if (Char == '*') { // /**/ comment. + if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) + goto LexNextToken; // GCC isn't tail call eliminating. 
+ return; // KeepCommentMode + } else if (Char == '=') { + Result.setKind(tok::slashequal); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::slash); + } + break; + case '%': + Char = getCharAndSize(CurPtr, SizeTmp); + if (Char == '=') { + Result.setKind(tok::percentequal); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Features.Digraphs && Char == '>') { + Result.setKind(tok::r_brace); // '%>' -> '}' + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Features.Digraphs && Char == ':') { + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + Char = getCharAndSize(CurPtr, SizeTmp); + if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { + Result.setKind(tok::hashhash); // '%:%:' -> '##' + CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result); + } else if (Char == '@' && Features.Microsoft) { // %:@ -> #@ -> Charize + Result.setKind(tok::hashat); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + Diag(BufferPtr, diag::charize_microsoft_ext); + } else { + Result.setKind(tok::hash); // '%:' -> '#' + + // We parsed a # character. If this occurs at the start of the line, + // it's actually the start of a preprocessing directive. Callback to + // the preprocessor to handle it. + // FIXME: -fpreprocessed mode?? + if (Result.isAtStartOfLine() && !LexingRawMode) { + BufferPtr = CurPtr; + PP->HandleDirective(Result); + + // As an optimization, if the preprocessor didn't switch lexers, tail + // recurse. + if (PP->isCurrentLexer(this)) { + // Start a new token. If this is a #include or something, the PP may + // want us starting at the beginning of the line again. If so, set + // the StartOfLine flag. + if (IsAtStartOfLine) { + Result.setFlag(Token::StartOfLine); + IsAtStartOfLine = false; + } + goto LexNextToken; // GCC isn't tail call eliminating. 
+ } + + return PP->Lex(Result); + } + } + } else { + Result.setKind(tok::percent); + } + break; + case '<': + Char = getCharAndSize(CurPtr, SizeTmp); + if (ParsingFilename) { + return LexAngledStringLiteral(Result, CurPtr+SizeTmp); + } else if (Char == '<' && + getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') { + Result.setKind(tok::lesslessequal); + CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result); + } else if (Char == '<') { + Result.setKind(tok::lessless); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Char == '=') { + Result.setKind(tok::lessequal); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Features.Digraphs && Char == ':') { + Result.setKind(tok::l_square); // '<:' -> '[' + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Features.Digraphs && Char == '%') { + Result.setKind(tok::l_brace); // '<%' -> '{' + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::less); + } + break; + case '>': + Char = getCharAndSize(CurPtr, SizeTmp); + if (Char == '=') { + Result.setKind(tok::greaterequal); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Char == '>' && + getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') { + Result.setKind(tok::greatergreaterequal); + CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result); + } else if (Char == '>') { + Result.setKind(tok::greatergreater); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::greater); + } + break; + case '^': + Char = getCharAndSize(CurPtr, SizeTmp); + if (Char == '=') { + Result.setKind(tok::caretequal); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::caret); + } + break; + case '|': + Char = getCharAndSize(CurPtr, SizeTmp); + if (Char == '=') { + Result.setKind(tok::pipeequal); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Char == '|') { + Result.setKind(tok::pipepipe); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::pipe); + } + break; + case ':': + Char = getCharAndSize(CurPtr, SizeTmp); + if (Features.Digraphs && Char == '>') { + Result.setKind(tok::r_square); // ':>' -> ']' + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Features.CPlusPlus && Char == ':') { + Result.setKind(tok::coloncolon); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::colon); + } + break; + case ';': + Result.setKind(tok::semi); + break; + case '=': + Char = getCharAndSize(CurPtr, SizeTmp); + if (Char == '=') { + Result.setKind(tok::equalequal); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::equal); + } + break; + case ',': + Result.setKind(tok::comma); + break; + case '#': + Char = getCharAndSize(CurPtr, SizeTmp); + if (Char == '#') { + Result.setKind(tok::hashhash); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else if (Char == '@' && Features.Microsoft) { // #@ -> Charize + Result.setKind(tok::hashat); + Diag(BufferPtr, diag::charize_microsoft_ext); + CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); + } else { + Result.setKind(tok::hash); + // We parsed a # character. If this occurs at the start of the line, + // it's actually the start of a preprocessing directive. Callback to + // the preprocessor to handle it. + // FIXME: -fpreprocessed mode?? 
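+      // (A '#' that is not at the start of a line is returned as an ordinary
+      // tok::hash token; e.g. the stringize operator inside a macro
+      // definition body.)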
+ if (Result.isAtStartOfLine() && !LexingRawMode) { + BufferPtr = CurPtr; + PP->HandleDirective(Result); + + // As an optimization, if the preprocessor didn't switch lexers, tail + // recurse. + if (PP->isCurrentLexer(this)) { + // Start a new token. If this is a #include or something, the PP may + // want us starting at the beginning of the line again. If so, set + // the StartOfLine flag. + if (IsAtStartOfLine) { + Result.setFlag(Token::StartOfLine); + IsAtStartOfLine = false; + } + goto LexNextToken; // GCC isn't tail call eliminating. + } + return PP->Lex(Result); + } + } + break; + + case '@': + // Objective C support. + if (CurPtr[-1] == '@' && Features.ObjC1) + Result.setKind(tok::at); + else + Result.setKind(tok::unknown); + break; + + case '\\': + // FIXME: UCN's. + // FALL THROUGH. + default: + Result.setKind(tok::unknown); + break; + } + + // Notify MIOpt that we read a non-whitespace/non-comment token. + MIOpt.ReadToken(); + + // Update the location of token as well as BufferPtr. + FormTokenWithChars(Result, CurPtr); +} diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp new file mode 100644 index 00000000000..aa0b831af90 --- /dev/null +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -0,0 +1,691 @@ +//===--- LiteralSupport.cpp - Code to parse and process literals ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the NumericLiteralParser, CharLiteralParser, and +// StringLiteralParser interfaces. +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/LiteralSupport.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TargetInfo.h" +#include "llvm/ADT/StringExtras.h" +using namespace clang; + +/// HexDigitValue - Return the value of the specified hex digit, or -1 if it's +/// not valid. +static int HexDigitValue(char C) { + if (C >= '0' && C <= '9') return C-'0'; + if (C >= 'a' && C <= 'f') return C-'a'+10; + if (C >= 'A' && C <= 'F') return C-'A'+10; + return -1; +} + +/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in +/// either a character or a string literal. +static unsigned ProcessCharEscape(const char *&ThisTokBuf, + const char *ThisTokEnd, bool &HadError, + SourceLocation Loc, bool IsWide, + Preprocessor &PP) { + // Skip the '\' char. + ++ThisTokBuf; + + // We know that this character can't be off the end of the buffer, because + // that would have been \", which would not have been the end of string. + unsigned ResultChar = *ThisTokBuf++; + switch (ResultChar) { + // These map to themselves. + case '\\': case '\'': case '"': case '?': break; + + // These have fixed mappings. + case 'a': + // TODO: K&R: the meaning of '\\a' is different in traditional C + ResultChar = 7; + break; + case 'b': + ResultChar = 8; + break; + case 'e': + PP.Diag(Loc, diag::ext_nonstandard_escape, "e"); + ResultChar = 27; + break; + case 'f': + ResultChar = 12; + break; + case 'n': + ResultChar = 10; + break; + case 'r': + ResultChar = 13; + break; + case 't': + ResultChar = 9; + break; + case 'v': + ResultChar = 11; + break; + + //case 'u': case 'U': // FIXME: UCNs. + case 'x': { // Hex escape. 
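+    // For example, "\x41" yields 0x41 ('A' in ASCII); a longer run such as
+    // "\x1234" keeps consuming hex digits and is diagnosed below if the
+    // value no longer fits in a character.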
+    ResultChar = 0;
+    if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
+      PP.Diag(Loc, diag::err_hex_escape_no_digits);
+      HadError = true;
+      break;
+    }
+
+    // Hex escapes are a maximal series of hex digits.
+    bool Overflow = false;
+    for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
+      int CharVal = HexDigitValue(ThisTokBuf[0]);
+      if (CharVal == -1) break;
+      // About to shift out a digit?
+      Overflow |= (ResultChar & 0xF0000000) ? true : false;
+      ResultChar <<= 4;
+      ResultChar |= CharVal;
+    }
+
+    // See if any bits will be truncated when evaluated as a character.
+    unsigned CharWidth = PP.getTargetInfo().getCharWidth(IsWide);
+
+    if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
+      Overflow = true;
+      ResultChar &= ~0U >> (32-CharWidth);
+    }
+
+    // Check for overflow.
+    if (Overflow)   // Too many digits to fit in the character.
+      PP.Diag(Loc, diag::warn_hex_escape_too_large);
+    break;
+  }
+  case '0': case '1': case '2': case '3':
+  case '4': case '5': case '6': case '7': {
+    // Octal escapes.
+    --ThisTokBuf;
+    ResultChar = 0;
+
+    // Octal escapes are a series of octal digits with maximum length 3.
+    // "\0123" is a two digit sequence equal to "\012" "3".
+    unsigned NumDigits = 0;
+    do {
+      ResultChar <<= 3;
+      ResultChar |= *ThisTokBuf++ - '0';
+      ++NumDigits;
+    } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
+             ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
+
+    // Check for overflow.  Reject '\777', but not L'\777'.
+    unsigned CharWidth = PP.getTargetInfo().getCharWidth(IsWide);
+
+    if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
+      PP.Diag(Loc, diag::warn_octal_escape_too_large);
+      ResultChar &= ~0U >> (32-CharWidth);
+    }
+    break;
+  }
+
+    // Otherwise, these are not valid escapes.
+  case '(': case '{': case '[': case '%':
+    // GCC accepts these as extensions.  We warn about them as such though.
+    if (!PP.getLangOptions().NoExtensions) {
+      PP.Diag(Loc, diag::ext_nonstandard_escape,
+              std::string()+(char)ResultChar);
+      break;
+    }
+    // FALL THROUGH.
+  default:
+    if (isgraph(ThisTokBuf[0])) {
+      PP.Diag(Loc, diag::ext_unknown_escape, std::string()+(char)ResultChar);
+    } else {
+      PP.Diag(Loc, diag::ext_unknown_escape, "x"+llvm::utohexstr(ResultChar));
+    }
+    break;
+  }
+
+  return ResultChar;
+}
+
+
+
+
+/// integer-constant: [C99 6.4.4.1]
+///   decimal-constant integer-suffix
+///   octal-constant integer-suffix
+///   hexadecimal-constant integer-suffix
+/// decimal-constant:
+///   nonzero-digit
+///   decimal-constant digit
+/// octal-constant:
+///   0
+///   octal-constant octal-digit
+/// hexadecimal-constant:
+///   hexadecimal-prefix hexadecimal-digit
+///   hexadecimal-constant hexadecimal-digit
+/// hexadecimal-prefix: one of
+///   0x 0X
+/// integer-suffix:
+///   unsigned-suffix [long-suffix]
+///   unsigned-suffix [long-long-suffix]
+///   long-suffix [unsigned-suffix]
+///   long-long-suffix [unsigned-suffix]
+/// nonzero-digit:
+///   1 2 3 4 5 6 7 8 9
+/// octal-digit:
+///   0 1 2 3 4 5 6 7
+/// hexadecimal-digit:
+///   0 1 2 3 4 5 6 7 8 9
+///   a b c d e f
+///   A B C D E F
+/// unsigned-suffix: one of
+///   u U
+/// long-suffix: one of
+///   l L
+/// long-long-suffix: one of
+///   ll LL
+///
+/// floating-constant: [C99 6.4.4.2]
+///   TODO: add rules...
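+///
+/// For example, 42, 052, and 0x2a all spell the value forty-two in decimal,
+/// octal, and hexadecimal radix respectively; 42ull adds an unsigned-suffix
+/// and a long-long-suffix.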
+///
+
+NumericLiteralParser::
+NumericLiteralParser(const char *begin, const char *end,
+                     SourceLocation TokLoc, Preprocessor &pp)
+  : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
+  s = DigitsBegin = begin;
+  saw_exponent = false;
+  saw_period = false;
+  isLong = false;
+  isUnsigned = false;
+  isLongLong = false;
+  isFloat = false;
+  isImaginary = false;
+  hadError = false;
+
+  if (*s == '0') { // parse radix
+    s++;
+    if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
+      s++;
+      radix = 16;
+      DigitsBegin = s;
+      s = SkipHexDigits(s);
+      if (s == ThisTokEnd) {
+        // Done.
+      } else if (*s == '.') {
+        s++;
+        saw_period = true;
+        s = SkipHexDigits(s);
+      }
+      // A binary exponent can appear with or without a '.'.  If dotted, the
+      // binary exponent is required.
+      if ((*s == 'p' || *s == 'P') && PP.getLangOptions().HexFloats) {
+        s++;
+        saw_exponent = true;
+        if (*s == '+' || *s == '-')  s++; // sign
+        const char *first_non_digit = SkipDigits(s);
+        if (first_non_digit == s) {
+          Diag(TokLoc, diag::err_exponent_has_no_digits);
+          return;
+        } else {
+          s = first_non_digit;
+        }
+      } else if (saw_period) {
+        Diag(TokLoc, diag::err_hexconstant_requires_exponent);
+        return;
+      }
+    } else if (*s == 'b' || *s == 'B') {
+      // 0b101010 is a GCC extension.
+      ++s;
+      radix = 2;
+      DigitsBegin = s;
+      s = SkipBinaryDigits(s);
+      if (s == ThisTokEnd) {
+        // Done.
+      } else if (isxdigit(*s)) {
+        Diag(TokLoc, diag::err_invalid_binary_digit, std::string(s, s+1));
+        return;
+      }
+      PP.Diag(TokLoc, diag::ext_binary_literal);
+    } else {
+      // For now, the radix is set to 8.  If we discover that we have a
+      // floating point constant, the radix will change to 10.  Octal floating
+      // point constants are not permitted (only decimal and hexadecimal).
+      radix = 8;
+      DigitsBegin = s;
+      s = SkipOctalDigits(s);
+      if (s == ThisTokEnd) {
+        // Done.
+      } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
+        TokLoc = PP.AdvanceToTokenCharacter(TokLoc, s-begin);
+        Diag(TokLoc, diag::err_invalid_octal_digit, std::string(s, s+1));
+        return;
+      } else if (*s == '.') {
+        s++;
+        radix = 10;
+        saw_period = true;
+        s = SkipDigits(s);
+      }
+      if (*s == 'e' || *s == 'E') { // exponent
+        s++;
+        radix = 10;
+        saw_exponent = true;
+        if (*s == '+' || *s == '-')  s++; // sign
+        const char *first_non_digit = SkipDigits(s);
+        if (first_non_digit == s) {
+          Diag(TokLoc, diag::err_exponent_has_no_digits);
+          return;
+        } else {
+          s = first_non_digit;
+        }
+      }
+    }
+  } else { // the first digit is non-zero
+    radix = 10;
+    s = SkipDigits(s);
+    if (s == ThisTokEnd) {
+      // Done.
+    } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
+      Diag(TokLoc, diag::err_invalid_decimal_digit, std::string(s, s+1));
+      return;
+    } else if (*s == '.') {
+      s++;
+      saw_period = true;
+      s = SkipDigits(s);
+    }
+    if (*s == 'e' || *s == 'E') { // exponent
+      s++;
+      saw_exponent = true;
+      if (*s == '+' || *s == '-')  s++; // sign
+      const char *first_non_digit = SkipDigits(s);
+      if (first_non_digit == s) {
+        Diag(TokLoc, diag::err_exponent_has_no_digits);
+        return;
+      } else {
+        s = first_non_digit;
+      }
+    }
+  }
+
+  SuffixBegin = s;
+
+  // Parse the suffix.  At this point we can classify whether we have an FP or
+  // integer constant.
+  bool isFPConstant = isFloatingLiteral();
+
+  // Loop over all of the characters of the suffix.  If we see something bad,
+  // we break out of the loop.
+  for (; s != ThisTokEnd; ++s) {
+    switch (*s) {
+    case 'f':      // FP Suffix for "float"
+    case 'F':
+      if (!isFPConstant) break;  // Error for integer constant.
+      if (isFloat || isLong) break; // FF, LF invalid.
+ isFloat = true; + continue; // Success. + case 'u': + case 'U': + if (isFPConstant) break; // Error for floating constant. + if (isUnsigned) break; // Cannot be repeated. + isUnsigned = true; + continue; // Success. + case 'l': + case 'L': + if (isLong || isLongLong) break; // Cannot be repeated. + if (isFloat) break; // LF invalid. + + // Check for long long. The L's need to be adjacent and the same case. + if (s+1 != ThisTokEnd && s[1] == s[0]) { + if (isFPConstant) break; // long long invalid for floats. + isLongLong = true; + ++s; // Eat both of them. + } else { + isLong = true; + } + continue; // Success. + case 'i': + case 'I': + case 'j': + case 'J': + if (isImaginary) break; // Cannot be repeated. + PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), + diag::ext_imaginary_constant); + isImaginary = true; + continue; // Success. + } + // If we reached here, there was an error. + break; + } + + // Report an error if there are any. + if (s != ThisTokEnd) { + TokLoc = PP.AdvanceToTokenCharacter(TokLoc, s-begin); + Diag(TokLoc, isFPConstant ? diag::err_invalid_suffix_float_constant : + diag::err_invalid_suffix_integer_constant, + std::string(SuffixBegin, ThisTokEnd)); + return; + } +} + +/// GetIntegerValue - Convert this numeric literal value to an APInt that +/// matches Val's input width. If there is an overflow, set Val to the low bits +/// of the result and return true. Otherwise, return false. +bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) { + Val = 0; + s = DigitsBegin; + + llvm::APInt RadixVal(Val.getBitWidth(), radix); + llvm::APInt CharVal(Val.getBitWidth(), 0); + llvm::APInt OldVal = Val; + + bool OverflowOccurred = false; + while (s < SuffixBegin) { + unsigned C = HexDigitValue(*s++); + + // If this letter is out of bound for this radix, reject it. + assert(C < radix && "NumericLiteralParser ctor should have rejected this"); + + CharVal = C; + + // Add the digit to the value in the appropriate radix. If adding in digits + // made the value smaller, then this overflowed. + OldVal = Val; + + // Multiply by radix, did overflow occur on the multiply? + Val *= RadixVal; + OverflowOccurred |= Val.udiv(RadixVal) != OldVal; + + OldVal = Val; + // Add value, did overflow occur on the value? + Val += CharVal; + OverflowOccurred |= Val.ult(OldVal); + OverflowOccurred |= Val.ult(CharVal); + } + return OverflowOccurred; +} + +llvm::APFloat NumericLiteralParser:: +GetFloatValue(const llvm::fltSemantics &Format, bool* isExact) { + using llvm::APFloat; + + llvm::SmallVector<char,256> floatChars; + for (unsigned i = 0, n = ThisTokEnd-ThisTokBegin; i != n; ++i) + floatChars.push_back(ThisTokBegin[i]); + + floatChars.push_back('\0'); + + APFloat V (Format, APFloat::fcZero, false); + APFloat::opStatus status; + + status = V.convertFromString(&floatChars[0],APFloat::rmNearestTiesToEven); + + if (isExact) + *isExact = status == APFloat::opOK; + + return V; +} + +void NumericLiteralParser::Diag(SourceLocation Loc, unsigned DiagID, + const std::string &M) { + PP.Diag(Loc, DiagID, M); + hadError = true; +} + + +CharLiteralParser::CharLiteralParser(const char *begin, const char *end, + SourceLocation Loc, Preprocessor &PP) { + // At this point we know that the character matches the regex "L?'.*'". + HadError = false; + Value = 0; + + // Determine if this is a wide character. + IsWide = begin[0] == 'L'; + if (IsWide) ++begin; + + // Skip over the entry quote. 
+  assert(begin[0] == '\'' && "Invalid token lexed");
+  ++begin;
+
+  // FIXME: This assumes that 'int' is 32-bits in overflow calculation, and the
+  // size of "value".
+  assert(PP.getTargetInfo().getIntWidth() == 32 &&
+         "Assumes sizeof(int) == 4 for now");
+  // FIXME: This assumes that wchar_t is 32-bits for now.
+  assert(PP.getTargetInfo().getWCharWidth() == 32 &&
+         "Assumes sizeof(wchar_t) == 4 for now");
+  // FIXME: This extensively assumes that 'char' is 8-bits.
+  assert(PP.getTargetInfo().getCharWidth() == 8 &&
+         "Assumes char is 8 bits");
+
+  bool isFirstChar = true;
+  bool isMultiChar = false;
+  while (begin[0] != '\'') {
+    unsigned ResultChar;
+    if (begin[0] != '\\')     // If this is a normal character, consume it.
+      ResultChar = *begin++;
+    else                      // Otherwise, this is an escape character.
+      ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP);
+
+    // If this is a multi-character constant (e.g. 'abc'), handle it.  These
+    // are implementation defined (C99 6.4.4.4p10).
+    if (!isFirstChar) {
+      // If this is the second character being processed, do special handling.
+      if (!isMultiChar) {
+        isMultiChar = true;
+
+        // Warn about discarding the top bits for multi-char wide-character
+        // constants (L'abcd').
+        if (IsWide)
+          PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
+      }
+
+      if (IsWide) {
+        // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
+        Value = 0;
+      } else {
+        // Narrow character literals act as though their value is concatenated
+        // in this implementation.
+        if (((Value << 8) >> 8) != Value)
+          PP.Diag(Loc, diag::warn_char_constant_too_large);
+        Value <<= 8;
+      }
+    }
+
+    Value += ResultChar;
+    isFirstChar = false;
+  }
+
+  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
+  // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
+  // character constants are not sign extended in this implementation:
+  // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
+  if (!IsWide && !isMultiChar && (Value & 128) &&
+      PP.getTargetInfo().isCharSigned())
+    Value = (signed char)Value;
+}
+
+
+/// string-literal: [C99 6.4.5]
+///          " [s-char-sequence] "
+///          L" [s-char-sequence] "
+/// s-char-sequence:
+///          s-char
+///          s-char-sequence s-char
+/// s-char:
+///          any source character except the double quote ",
+///            backslash \, or newline character
+///          escape-character
+///          universal-character-name
+/// escape-character: [C99 6.4.4.4]
+///          \ escape-code
+///          universal-character-name
+/// escape-code:
+///          character-escape-code
+///          octal-escape-code
+///          hex-escape-code
+/// character-escape-code: one of
+///          n t b r f v a
+///          \ ' " ?
+/// octal-escape-code:
+///          octal-digit
+///          octal-digit octal-digit
+///          octal-digit octal-digit octal-digit
+/// hex-escape-code:
+///          x hex-digit
+///          hex-escape-code hex-digit
+/// universal-character-name:
+///          \u hex-quad
+///          \U hex-quad hex-quad
+/// hex-quad:
+///          hex-digit hex-digit hex-digit hex-digit
+///
+StringLiteralParser::
+StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
+                    Preprocessor &pp, TargetInfo &t)
+  : PP(pp), Target(t) {
+  // Scan all of the string portions, remember the max individual token length,
+  // computing a bound on the concatenated string length, and see whether any
+  // piece is a wide-string.  If any of the string portions is a wide-string
+  // literal, the result is a wide-string literal [C99 6.4.5p4].
+  MaxTokenLength = StringToks[0].getLength();
+  SizeBound = StringToks[0].getLength()-2;  // -2 for "".
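+  // (For example, the four-character token "ab" can contribute at most two
+  // bytes of string data.)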
+ AnyWide = StringToks[0].is(tok::wide_string_literal); + + hadError = false; + + // Implement Translation Phase #6: concatenation of string literals + /// (C99 5.1.1.2p1). The common case is only one string fragment. + for (unsigned i = 1; i != NumStringToks; ++i) { + // The string could be shorter than this if it needs cleaning, but this is a + // reasonable bound, which is all we need. + SizeBound += StringToks[i].getLength()-2; // -2 for "". + + // Remember maximum string piece length. + if (StringToks[i].getLength() > MaxTokenLength) + MaxTokenLength = StringToks[i].getLength(); + + // Remember if we see any wide strings. + AnyWide |= StringToks[i].is(tok::wide_string_literal); + } + + + // Include space for the null terminator. + ++SizeBound; + + // TODO: K&R warning: "traditional C rejects string constant concatenation" + + // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not + // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true. + wchar_tByteWidth = ~0U; + if (AnyWide) { + wchar_tByteWidth = Target.getWCharWidth(); + assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!"); + wchar_tByteWidth /= 8; + } + + // The output buffer size needs to be large enough to hold wide characters. + // This is a worst-case assumption which basically corresponds to L"" "long". + if (AnyWide) + SizeBound *= wchar_tByteWidth; + + // Size the temporary buffer to hold the result string data. + ResultBuf.resize(SizeBound); + + // Likewise, but for each string piece. + llvm::SmallString<512> TokenBuf; + TokenBuf.resize(MaxTokenLength); + + // Loop over all the strings, getting their spelling, and expanding them to + // wide strings as appropriate. + ResultPtr = &ResultBuf[0]; // Next byte to fill in. + + Pascal = false; + + for (unsigned i = 0, e = NumStringToks; i != e; ++i) { + const char *ThisTokBuf = &TokenBuf[0]; + // Get the spelling of the token, which eliminates trigraphs, etc. We know + // that ThisTokBuf points to a buffer that is big enough for the whole token + // and 'spelled' tokens can only shrink. + unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf); + const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote. + + // TODO: Input character set mapping support. + + // Skip L marker for wide strings. + bool ThisIsWide = false; + if (ThisTokBuf[0] == 'L') { + ++ThisTokBuf; + ThisIsWide = true; + } + + assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); + ++ThisTokBuf; + + // Check if this is a pascal string + if (pp.getLangOptions().PascalStrings && ThisTokBuf + 1 != ThisTokEnd && + ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') { + + // If the \p sequence is found in the first token, we have a pascal string + // Otherwise, if we already have a pascal string, ignore the first \p + if (i == 0) { + ++ThisTokBuf; + Pascal = true; + } else if (Pascal) + ThisTokBuf += 2; + } + + while (ThisTokBuf != ThisTokEnd) { + // Is this a span of non-escape characters? + if (ThisTokBuf[0] != '\\') { + const char *InStart = ThisTokBuf; + do { + ++ThisTokBuf; + } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); + + // Copy the character span over. + unsigned Len = ThisTokBuf-InStart; + if (!AnyWide) { + memcpy(ResultPtr, InStart, Len); + ResultPtr += Len; + } else { + // Note: our internal rep of wide char tokens is always little-endian. + for (; Len; --Len, ++InStart) { + *ResultPtr++ = InStart[0]; + // Add zeros at the end. 
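+          // (Low byte first, then wchar_tByteWidth-1 zero bytes per
+          // character.)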
+          for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+            *ResultPtr++ = 0;
+        }
+      }
+      continue;
+    }
+
+    // Otherwise, this is an escape character.  Process it.
+    unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
+                                            StringToks[i].getLocation(),
+                                            ThisIsWide, PP);
+
+    // Note: our internal rep of wide char tokens is always little-endian.
+    *ResultPtr++ = ResultChar & 0xFF;
+
+    if (AnyWide) {
+      for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+        *ResultPtr++ = ResultChar >> i*8;
+    }
+  }
+
+  // Add zero terminator.
+  *ResultPtr = 0;
+  if (AnyWide) {
+    for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+      *ResultPtr++ = 0;
+  }
+
+  if (Pascal)
+    ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
+}
diff --git a/clang/lib/Lex/MacroArgs.cpp b/clang/lib/Lex/MacroArgs.cpp
new file mode 100644
index 00000000000..a26e50eb762
--- /dev/null
+++ b/clang/lib/Lex/MacroArgs.cpp
@@ -0,0 +1,225 @@
+//===--- MacroArgs.cpp - Formal argument info for Macros ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MacroArgs interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MacroArgs.h"
+#include "clang/Lex/MacroInfo.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Basic/Diagnostic.h"
+using namespace clang;
+
+/// MacroArgs ctor function - Create a new MacroArgs object, copying the
+/// unexpanded argument tokens to memory immediately after it.
+MacroArgs *MacroArgs::create(const MacroInfo *MI,
+                             const Token *UnexpArgTokens,
+                             unsigned NumToks, bool VarargsElided) {
+  assert(MI->isFunctionLike() &&
+         "Can't have args for an object-like macro!");
+
+  // Allocate memory for the MacroArgs object with the lexer tokens at the end.
+  MacroArgs *Result = (MacroArgs*)malloc(sizeof(MacroArgs) +
+                                         NumToks*sizeof(Token));
+  // Construct the macroargs object.
+  new (Result) MacroArgs(NumToks, VarargsElided);
+
+  // Copy the actual unexpanded tokens to immediately after the result ptr.
+  if (NumToks)
+    memcpy(const_cast<Token*>(Result->getUnexpArgument(0)),
+           UnexpArgTokens, NumToks*sizeof(Token));
+
+  return Result;
+}
+
+/// destroy - Destroy and deallocate the memory for this object.
+///
+void MacroArgs::destroy() {
+  // Run the dtor to deallocate the vectors.
+  this->~MacroArgs();
+  // Release the memory for the object.
+  free(this);
+}
+
+
+/// getArgLength - Given a pointer to an expanded or unexpanded argument,
+/// return the number of tokens, not counting the EOF, that make up the
+/// argument.
+unsigned MacroArgs::getArgLength(const Token *ArgPtr) {
+  unsigned NumArgTokens = 0;
+  for (; ArgPtr->isNot(tok::eof); ++ArgPtr)
+    ++NumArgTokens;
+  return NumArgTokens;
+}
+
+
+/// getUnexpArgument - Return the unexpanded tokens for the specified formal.
+///
+const Token *MacroArgs::getUnexpArgument(unsigned Arg) const {
+  // The unexpanded argument tokens start immediately after the MacroArgs
+  // object in memory.
+  const Token *Start = (const Token *)(this+1);
+  const Token *Result = Start;
+  // Scan to find Arg.
+  for (; Arg; ++Result) {
+    assert(Result < Start+NumUnexpArgTokens && "Invalid arg #");
+    if (Result->is(tok::eof))
+      --Arg;
+  }
+  return Result;
+}
+
+
+/// ArgNeedsPreexpansion - If we can prove that the argument won't be affected
+/// by pre-expansion, return false.  Otherwise, conservatively return true.
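+/// For example, given "#define ID(x) x", the argument in "ID(FOO)" needs
+/// pre-expansion whenever FOO is itself an enabled macro; an argument like
+/// "ID(1 + 2)" never does.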
+bool MacroArgs::ArgNeedsPreexpansion(const Token *ArgTok,
+                                     Preprocessor &PP) const {
+  // If there are no identifiers in the argument list, or if the identifiers
+  // are known to not be macros, pre-expansion won't modify it.
+  for (; ArgTok->isNot(tok::eof); ++ArgTok)
+    if (IdentifierInfo *II = ArgTok->getIdentifierInfo()) {
+      if (II->hasMacroDefinition() && PP.getMacroInfo(II)->isEnabled())
+        // Return true even though the macro could be a function-like macro
+        // without a following '(' token.
+        return true;
+    }
+  return false;
+}
+
+/// getPreExpArgument - Return the pre-expanded form of the specified
+/// argument.
+const std::vector<Token> &
+MacroArgs::getPreExpArgument(unsigned Arg, Preprocessor &PP) {
+  assert(Arg < NumUnexpArgTokens && "Invalid argument number!");
+
+  // If we have already computed this, return it.
+  if (PreExpArgTokens.empty())
+    PreExpArgTokens.resize(NumUnexpArgTokens);
+
+  std::vector<Token> &Result = PreExpArgTokens[Arg];
+  if (!Result.empty()) return Result;
+
+  const Token *AT = getUnexpArgument(Arg);
+  unsigned NumToks = getArgLength(AT)+1;  // Include the EOF.
+
+  // Otherwise, we have to pre-expand this argument, populating Result.  To do
+  // this, we set up a fake TokenLexer to lex from the unexpanded argument
+  // list.  With this installed, we lex expanded tokens until we hit the EOF
+  // token at the end of the unexp list.
+  PP.EnterTokenStream(AT, NumToks, false /*disable expand*/,
+                      false /*owns tokens*/);
+
+  // Lex all of the macro-expanded tokens into Result.
+  do {
+    Result.push_back(Token());
+    PP.Lex(Result.back());
+  } while (Result.back().isNot(tok::eof));
+
+  // Pop the token stream off the top of the stack.  We know that its internal
+  // pointer is at the end of the token stream, but the stack will not
+  // otherwise be popped until the next token is lexed.  The problem is that
+  // the token may be lexed sometime after the vector of tokens itself is
+  // destroyed, which would be badness.
+  PP.RemoveTopOfLexerStack();
+  return Result;
+}
+
+
+/// StringifyArgument - Implement C99 6.10.3.2p2, converting a sequence of
+/// tokens into the literal string token that should be produced by the C #
+/// preprocessor operator.  If Charify is true, then it should be turned into
+/// a character literal for the Microsoft charize (#@) extension.
+///
+Token MacroArgs::StringifyArgument(const Token *ArgToks,
+                                   Preprocessor &PP, bool Charify) {
+  Token Tok;
+  Tok.startToken();
+  Tok.setKind(tok::string_literal);
+
+  const Token *ArgTokStart = ArgToks;
+
+  // Stringify all the tokens.
+  std::string Result = "\"";
+  // FIXME: Optimize this loop to not use std::strings.
+  bool isFirst = true;
+  for (; ArgToks->isNot(tok::eof); ++ArgToks) {
+    const Token &Tok = *ArgToks;
+    if (!isFirst && (Tok.hasLeadingSpace() || Tok.isAtStartOfLine()))
+      Result += ' ';
+    isFirst = false;
+
+    // If this is a string or character constant, escape the token as specified
+    // by 6.10.3.2p2.
+    if (Tok.is(tok::string_literal) ||       // "foo"
+        Tok.is(tok::wide_string_literal) ||  // L"foo"
+        Tok.is(tok::char_constant)) {        // 'x' and L'x'.
+      Result += Lexer::Stringify(PP.getSpelling(Tok));
+    } else {
+      // Otherwise, just append the token.
+      Result += PP.getSpelling(Tok);
+    }
+  }
+
+  // If the last character of the string is a \, and if it isn't escaped, this
+  // is an invalid string literal, diagnose it as specified in C99.
+  if (Result[Result.size()-1] == '\\') {
+    // Count the number of consecutive \ characters. 
If even, then they are + // just escaped backslashes, otherwise it's an error. + unsigned FirstNonSlash = Result.size()-2; + // Guaranteed to find the starting " if nothing else. + while (Result[FirstNonSlash] == '\\') + --FirstNonSlash; + if ((Result.size()-1-FirstNonSlash) & 1) { + // Diagnose errors for things like: #define F(X) #X / F(\) + PP.Diag(ArgToks[-1], diag::pp_invalid_string_literal); + Result.erase(Result.end()-1); // remove one of the \'s. + } + } + Result += '"'; + + // If this is the charify operation and the result is not a legal character + // constant, diagnose it. + if (Charify) { + // First step, turn double quotes into single quotes: + Result[0] = '\''; + Result[Result.size()-1] = '\''; + + // Check for bogus character. + bool isBad = false; + if (Result.size() == 3) { + isBad = Result[1] == '\''; // ''' is not legal. '\' already fixed above. + } else { + isBad = (Result.size() != 4 || Result[1] != '\\'); // Not '\x' + } + + if (isBad) { + PP.Diag(ArgTokStart[0], diag::err_invalid_character_to_charify); + Result = "' '"; // Use something arbitrary, but legal. + } + } + + Tok.setLength(Result.size()); + Tok.setLocation(PP.CreateString(&Result[0], Result.size())); + return Tok; +} + +/// getStringifiedArgument - Compute, cache, and return the specified argument +/// that has been 'stringified' as required by the # operator. +const Token &MacroArgs::getStringifiedArgument(unsigned ArgNo, + Preprocessor &PP) { + assert(ArgNo < NumUnexpArgTokens && "Invalid argument number!"); + if (StringifiedArgs.empty()) { + StringifiedArgs.resize(getNumArguments()); + memset(&StringifiedArgs[0], 0, + sizeof(StringifiedArgs[0])*getNumArguments()); + } + if (StringifiedArgs[ArgNo].isNot(tok::string_literal)) + StringifiedArgs[ArgNo] = StringifyArgument(getUnexpArgument(ArgNo), PP); + return StringifiedArgs[ArgNo]; +} diff --git a/clang/lib/Lex/MacroArgs.h b/clang/lib/Lex/MacroArgs.h new file mode 100644 index 00000000000..4b22fa18aa8 --- /dev/null +++ b/clang/lib/Lex/MacroArgs.h @@ -0,0 +1,109 @@ +//===--- MacroArgs.h - Formal argument info for Macros ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the MacroArgs interface. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_MACROARGS_H +#define LLVM_CLANG_MACROARGS_H + +#include <vector> + +namespace clang { + class MacroInfo; + class Preprocessor; + class Token; + +/// MacroArgs - An instance of this class captures information about +/// the formal arguments specified to a function-like macro invocation. +class MacroArgs { + /// NumUnexpArgTokens - The number of raw, unexpanded tokens for the + /// arguments. All of the actual argument tokens are allocated immediately + /// after the MacroArgs object in memory. This is all of the arguments + /// concatenated together, with 'EOF' markers at the end of each argument. + unsigned NumUnexpArgTokens; + + /// PreExpArgTokens - Pre-expanded tokens for arguments that need them. Empty + /// if not yet computed. This includes the EOF marker at the end of the + /// stream. + std::vector<std::vector<Token> > PreExpArgTokens; + + /// StringifiedArgs - This contains arguments in 'stringified' form. If the + /// stringified form of an argument has not yet been computed, this is empty. 
+ std::vector<Token> StringifiedArgs; + + /// VarargsElided - True if this is a C99 style varargs macro invocation and + /// there was no argument specified for the "..." argument. If the argument + /// was specified (even empty) or this isn't a C99 style varargs function, or + /// if in strict mode and the C99 varargs macro had only a ... argument, this + /// is false. + bool VarargsElided; + + MacroArgs(unsigned NumToks, bool varargsElided) + : NumUnexpArgTokens(NumToks), VarargsElided(varargsElided) {} + ~MacroArgs() {} +public: + /// MacroArgs ctor function - Create a new MacroArgs object with the specified + /// macro and argument info. + static MacroArgs *create(const MacroInfo *MI, + const Token *UnexpArgTokens, + unsigned NumArgTokens, bool VarargsElided); + + /// destroy - Destroy and deallocate the memory for this object. + /// + void destroy(); + + /// ArgNeedsPreexpansion - If we can prove that the argument won't be affected + /// by pre-expansion, return false. Otherwise, conservatively return true. + bool ArgNeedsPreexpansion(const Token *ArgTok, Preprocessor &PP) const; + + /// getUnexpArgument - Return a pointer to the first token of the unexpanded + /// token list for the specified formal. + /// + const Token *getUnexpArgument(unsigned Arg) const; + + /// getArgLength - Given a pointer to an expanded or unexpanded argument, + /// return the number of tokens, not counting the EOF, that make up the + /// argument. + static unsigned getArgLength(const Token *ArgPtr); + + /// getPreExpArgument - Return the pre-expanded form of the specified + /// argument. + const std::vector<Token> & + getPreExpArgument(unsigned Arg, Preprocessor &PP); + + /// getStringifiedArgument - Compute, cache, and return the specified argument + /// that has been 'stringified' as required by the # operator. + const Token &getStringifiedArgument(unsigned ArgNo, Preprocessor &PP); + + /// getNumArguments - Return the number of arguments passed into this macro + /// invocation. + unsigned getNumArguments() const { return NumUnexpArgTokens; } + + + /// isVarargsElidedUse - Return true if this is a C99 style varargs macro + /// invocation and there was no argument specified for the "..." argument. If + /// the argument was specified (even empty) or this isn't a C99 style varargs + /// function, or if in strict mode and the C99 varargs macro had only a ... + /// argument, this returns false. + bool isVarargsElidedUse() const { return VarargsElided; } + + /// StringifyArgument - Implement C99 6.10.3.2p2, converting a sequence of + /// tokens into the literal string token that should be produced by the C # + /// preprocessor operator. If Charify is true, then it should be turned into + /// a character literal for the Microsoft charize (#@) extension. + /// + static Token StringifyArgument(const Token *ArgToks, + Preprocessor &PP, bool Charify = false); +}; + +} // end namespace clang + +#endif diff --git a/clang/lib/Lex/MacroInfo.cpp b/clang/lib/Lex/MacroInfo.cpp new file mode 100644 index 00000000000..de19ff502a6 --- /dev/null +++ b/clang/lib/Lex/MacroInfo.cpp @@ -0,0 +1,70 @@ +//===--- MacroInfo.cpp - Information about #defined identifiers -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the MacroInfo interface. 
+// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/MacroInfo.h" +#include "clang/Lex/Preprocessor.h" +using namespace clang; + +MacroInfo::MacroInfo(SourceLocation DefLoc) : Location(DefLoc) { + IsFunctionLike = false; + IsC99Varargs = false; + IsGNUVarargs = false; + IsBuiltinMacro = false; + IsDisabled = false; + IsUsed = true; + + ArgumentList = 0; + NumArguments = 0; +} + +/// isIdenticalTo - Return true if the specified macro definition is equal to +/// this macro in spelling, arguments, and whitespace. This is used to emit +/// duplicate definition warnings. This implements the rules in C99 6.10.3. +/// +bool MacroInfo::isIdenticalTo(const MacroInfo &Other, Preprocessor &PP) const { + // Check # tokens in replacement, number of args, and various flags all match. + if (ReplacementTokens.size() != Other.ReplacementTokens.size() || + getNumArgs() != Other.getNumArgs() || + isFunctionLike() != Other.isFunctionLike() || + isC99Varargs() != Other.isC99Varargs() || + isGNUVarargs() != Other.isGNUVarargs()) + return false; + + // Check arguments. + for (arg_iterator I = arg_begin(), OI = Other.arg_begin(), E = arg_end(); + I != E; ++I, ++OI) + if (*I != *OI) return false; + + // Check all the tokens. + for (unsigned i = 0, e = ReplacementTokens.size(); i != e; ++i) { + const Token &A = ReplacementTokens[i]; + const Token &B = Other.ReplacementTokens[i]; + if (A.getKind() != B.getKind() || + A.isAtStartOfLine() != B.isAtStartOfLine() || + A.hasLeadingSpace() != B.hasLeadingSpace()) + return false; + + // If this is an identifier, it is easy. + if (A.getIdentifierInfo() || B.getIdentifierInfo()) { + if (A.getIdentifierInfo() != B.getIdentifierInfo()) + return false; + continue; + } + + // Otherwise, check the spelling. + if (PP.getSpelling(A) != PP.getSpelling(B)) + return false; + } + + return true; +} diff --git a/clang/lib/Lex/Makefile b/clang/lib/Lex/Makefile new file mode 100644 index 00000000000..187448c9922 --- /dev/null +++ b/clang/lib/Lex/Makefile @@ -0,0 +1,28 @@ +##===- clang/lib/Lex/Makefile ------------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +# +# This implements the Lexer library for the C-Language front-end. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +include $(LEVEL)/Makefile.config + +LIBRARYNAME := clangLex +BUILD_ARCHIVE = 1 +CXXFLAGS = -fno-rtti + +ifeq ($(ARCH),PowerPC) +CXXFLAGS += -maltivec +endif + +CPPFLAGS += -I$(PROJ_SRC_DIR)/../../include + +include $(LEVEL)/Makefile.common + diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp new file mode 100644 index 00000000000..b24f5b63760 --- /dev/null +++ b/clang/lib/Lex/PPDirectives.cpp @@ -0,0 +1,1153 @@ +//===--- PPDirectives.cpp - Directive Handling for Preprocessor -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements # directive processing for the Preprocessor. 
+// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/MacroInfo.h" +#include "clang/Lex/PPCallbacks.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/SourceManager.h" +using namespace clang; + +//===----------------------------------------------------------------------===// +// Utility Methods for Preprocessor Directive Handling. +//===----------------------------------------------------------------------===// + +/// DiscardUntilEndOfDirective - Read and discard all tokens remaining on the +/// current line until the tok::eom token is found. +void Preprocessor::DiscardUntilEndOfDirective() { + Token Tmp; + do { + LexUnexpandedToken(Tmp); + } while (Tmp.isNot(tok::eom)); +} + +/// isCXXNamedOperator - Returns "true" if the token is a named operator in C++. +static bool isCXXNamedOperator(const std::string &Spelling) { + return Spelling == "and" || Spelling == "bitand" || Spelling == "bitor" || + Spelling == "compl" || Spelling == "not" || Spelling == "not_eq" || + Spelling == "or" || Spelling == "xor"; +} + +/// ReadMacroName - Lex and validate a macro name, which occurs after a +/// #define or #undef. This sets the token kind to eom and discards the rest +/// of the macro line if the macro name is invalid. isDefineUndef is 1 if +/// this is due to a a #define, 2 if #undef directive, 0 if it is something +/// else (e.g. #ifdef). +void Preprocessor::ReadMacroName(Token &MacroNameTok, char isDefineUndef) { + // Read the token, don't allow macro expansion on it. + LexUnexpandedToken(MacroNameTok); + + // Missing macro name? + if (MacroNameTok.is(tok::eom)) + return Diag(MacroNameTok, diag::err_pp_missing_macro_name); + + IdentifierInfo *II = MacroNameTok.getIdentifierInfo(); + if (II == 0) { + std::string Spelling = getSpelling(MacroNameTok); + if (isCXXNamedOperator(Spelling)) + // C++ 2.5p2: Alternative tokens behave the same as its primary token + // except for their spellings. + Diag(MacroNameTok, diag::err_pp_operator_used_as_macro_name, Spelling); + else + Diag(MacroNameTok, diag::err_pp_macro_not_identifier); + // Fall through on error. + } else if (isDefineUndef && II->getPPKeywordID() == tok::pp_defined) { + // Error if defining "defined": C99 6.10.8.4. + Diag(MacroNameTok, diag::err_defined_macro_name); + } else if (isDefineUndef && II->hasMacroDefinition() && + getMacroInfo(II)->isBuiltinMacro()) { + // Error if defining "__LINE__" and other builtins: C99 6.10.8.4. + if (isDefineUndef == 1) + Diag(MacroNameTok, diag::pp_redef_builtin_macro); + else + Diag(MacroNameTok, diag::pp_undef_builtin_macro); + } else { + // Okay, we got a good identifier node. Return it. + return; + } + + // Invalid macro name, read and discard the rest of the line. Then set the + // token kind to tok::eom. + MacroNameTok.setKind(tok::eom); + return DiscardUntilEndOfDirective(); +} + +/// CheckEndOfDirective - Ensure that the next token is a tok::eom token. If +/// not, emit a diagnostic and consume up until the eom. +void Preprocessor::CheckEndOfDirective(const char *DirType) { + Token Tmp; + // Lex unexpanded tokens: macros might expand to zero tokens, causing us to + // miss diagnosing invalid lines. + LexUnexpandedToken(Tmp); + + // There should be no tokens after the directive, but we allow them as an + // extension. + while (Tmp.is(tok::comment)) // Skip comments in -C mode. 
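// [Editor's note] A standalone sketch (not clang API) of the macro-name
// rules ReadMacroName enforces above: the token must be an identifier, must
// not be 'defined' (C99 6.10.8.4), and in C++ must not be one of the named
// alternative operators, mirroring the eight spellings isCXXNamedOperator
// checks.
#include <cctype>
#include <set>
#include <string>

enum class NameCheck { OK, NotIdentifier, IsDefined, NamedOperator };

static NameCheck CheckMacroName(const std::string &Name, bool IsCPlusPlus) {
  static const std::set<std::string> NamedOps = {
      "and", "bitand", "bitor", "compl", "not", "not_eq", "or", "xor"};
  if (Name.empty() ||
      (!std::isalpha(static_cast<unsigned char>(Name[0])) && Name[0] != '_'))
    return NameCheck::NotIdentifier;      // e.g. "#define 1x"
  if (Name == "defined")
    return NameCheck::IsDefined;          // defining 'defined' is an error
  if (IsCPlusPlus && NamedOps.count(Name))
    return NameCheck::NamedOperator;      // e.g. "#define and 1" in C++
  return NameCheck::OK;
}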
+ LexUnexpandedToken(Tmp); + + if (Tmp.isNot(tok::eom)) { + Diag(Tmp, diag::ext_pp_extra_tokens_at_eol, DirType); + DiscardUntilEndOfDirective(); + } +} + + + +/// SkipExcludedConditionalBlock - We just read a #if or related directive and +/// decided that the subsequent tokens are in the #if'd out portion of the +/// file. Lex the rest of the file, until we see an #endif. If +/// FoundNonSkipPortion is true, then we have already emitted code for part of +/// this #if directive, so #else/#elif blocks should never be entered. If ElseOk +/// is true, then #else directives are ok, if not, then we have already seen one +/// so a #else directive is a duplicate. When this returns, the caller can lex +/// the first valid token. +void Preprocessor::SkipExcludedConditionalBlock(SourceLocation IfTokenLoc, + bool FoundNonSkipPortion, + bool FoundElse) { + ++NumSkipped; + assert(CurTokenLexer == 0 && CurLexer && + "Lexing a macro, not a file?"); + + CurLexer->pushConditionalLevel(IfTokenLoc, /*isSkipping*/false, + FoundNonSkipPortion, FoundElse); + + // Enter raw mode to disable identifier lookup (and thus macro expansion), + // disabling warnings, etc. + CurLexer->LexingRawMode = true; + Token Tok; + while (1) { + CurLexer->Lex(Tok); + + // If this is the end of the buffer, we have an error. + if (Tok.is(tok::eof)) { + // Emit errors for each unterminated conditional on the stack, including + // the current one. + while (!CurLexer->ConditionalStack.empty()) { + Diag(CurLexer->ConditionalStack.back().IfLoc, + diag::err_pp_unterminated_conditional); + CurLexer->ConditionalStack.pop_back(); + } + + // Just return and let the caller lex after this #include. + break; + } + + // If this token is not a preprocessor directive, just skip it. + if (Tok.isNot(tok::hash) || !Tok.isAtStartOfLine()) + continue; + + // We just parsed a # character at the start of a line, so we're in + // directive mode. Tell the lexer this so any newlines we see will be + // converted into an EOM token (this terminates the macro). + CurLexer->ParsingPreprocessorDirective = true; + CurLexer->KeepCommentMode = false; + + + // Read the next token, the directive flavor. + LexUnexpandedToken(Tok); + + // If this isn't an identifier directive (e.g. is "# 1\n" or "#\n", or + // something bogus), skip it. + if (Tok.isNot(tok::identifier)) { + CurLexer->ParsingPreprocessorDirective = false; + // Restore comment saving mode. + CurLexer->KeepCommentMode = KeepComments; + continue; + } + + // If the first letter isn't i or e, it isn't intesting to us. We know that + // this is safe in the face of spelling differences, because there is no way + // to spell an i/e in a strange way that is another letter. Skipping this + // allows us to avoid looking up the identifier info for #define/#undef and + // other common directives. + const char *RawCharData = SourceMgr.getCharacterData(Tok.getLocation()); + char FirstChar = RawCharData[0]; + if (FirstChar >= 'a' && FirstChar <= 'z' && + FirstChar != 'i' && FirstChar != 'e') { + CurLexer->ParsingPreprocessorDirective = false; + // Restore comment saving mode. + CurLexer->KeepCommentMode = KeepComments; + continue; + } + + // Get the identifier name without trigraphs or embedded newlines. Note + // that we can't use Tok.getIdentifierInfo() because its lookup is disabled + // when skipping. + // TODO: could do this with zero copies in the no-clean case by using + // strncmp below. 
+ char Directive[20]; + unsigned IdLen; + if (!Tok.needsCleaning() && Tok.getLength() < 20) { + IdLen = Tok.getLength(); + memcpy(Directive, RawCharData, IdLen); + Directive[IdLen] = 0; + } else { + std::string DirectiveStr = getSpelling(Tok); + IdLen = DirectiveStr.size(); + if (IdLen >= 20) { + CurLexer->ParsingPreprocessorDirective = false; + // Restore comment saving mode. + CurLexer->KeepCommentMode = KeepComments; + continue; + } + memcpy(Directive, &DirectiveStr[0], IdLen); + Directive[IdLen] = 0; + } + + if (FirstChar == 'i' && Directive[1] == 'f') { + if ((IdLen == 2) || // "if" + (IdLen == 5 && !strcmp(Directive+2, "def")) || // "ifdef" + (IdLen == 6 && !strcmp(Directive+2, "ndef"))) { // "ifndef" + // We know the entire #if/#ifdef/#ifndef block will be skipped, don't + // bother parsing the condition. + DiscardUntilEndOfDirective(); + CurLexer->pushConditionalLevel(Tok.getLocation(), /*wasskipping*/true, + /*foundnonskip*/false, + /*fnddelse*/false); + } + } else if (FirstChar == 'e') { + if (IdLen == 5 && !strcmp(Directive+1, "ndif")) { // "endif" + CheckEndOfDirective("#endif"); + PPConditionalInfo CondInfo; + CondInfo.WasSkipping = true; // Silence bogus warning. + bool InCond = CurLexer->popConditionalLevel(CondInfo); + InCond = InCond; // Silence warning in no-asserts mode. + assert(!InCond && "Can't be skipping if not in a conditional!"); + + // If we popped the outermost skipping block, we're done skipping! + if (!CondInfo.WasSkipping) + break; + } else if (IdLen == 4 && !strcmp(Directive+1, "lse")) { // "else". + // #else directive in a skipping conditional. If not in some other + // skipping conditional, and if #else hasn't already been seen, enter it + // as a non-skipping conditional. + CheckEndOfDirective("#else"); + PPConditionalInfo &CondInfo = CurLexer->peekConditionalLevel(); + + // If this is a #else with a #else before it, report the error. + if (CondInfo.FoundElse) Diag(Tok, diag::pp_err_else_after_else); + + // Note that we've seen a #else in this conditional. + CondInfo.FoundElse = true; + + // If the conditional is at the top level, and the #if block wasn't + // entered, enter the #else block now. + if (!CondInfo.WasSkipping && !CondInfo.FoundNonSkip) { + CondInfo.FoundNonSkip = true; + break; + } + } else if (IdLen == 4 && !strcmp(Directive+1, "lif")) { // "elif". + PPConditionalInfo &CondInfo = CurLexer->peekConditionalLevel(); + + bool ShouldEnter; + // If this is in a skipping block or if we're already handled this #if + // block, don't bother parsing the condition. + if (CondInfo.WasSkipping || CondInfo.FoundNonSkip) { + DiscardUntilEndOfDirective(); + ShouldEnter = false; + } else { + // Restore the value of LexingRawMode so that identifiers are + // looked up, etc, inside the #elif expression. + assert(CurLexer->LexingRawMode && "We have to be skipping here!"); + CurLexer->LexingRawMode = false; + IdentifierInfo *IfNDefMacro = 0; + ShouldEnter = EvaluateDirectiveExpression(IfNDefMacro); + CurLexer->LexingRawMode = true; + } + + // If this is a #elif with a #else before it, report the error. + if (CondInfo.FoundElse) Diag(Tok, diag::pp_err_elif_after_else); + + // If this condition is true, enter it! + if (ShouldEnter) { + CondInfo.FoundNonSkip = true; + break; + } + } + } + + CurLexer->ParsingPreprocessorDirective = false; + // Restore comment saving mode. 
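// [Editor's note] A reduced, standalone model of the state machine the
// skipping loop above drives. Each nesting level is a PPConditionalInfo;
// this sketch keeps only the three flags the loop consults and returns true
// when skipping should stop. Hypothetical helper, not clang code.
#include <string>
#include <vector>

struct CondLevel {
  bool WasSkipping;   // the enclosing region was itself being skipped
  bool FoundNonSkip;  // some branch of this conditional was already entered
  bool FoundElse;     // a #else was seen; a second one is diagnosed
};

static bool OnDirectiveWhileSkipping(std::vector<CondLevel> &Stack,
                                     const std::string &Name,
                                     bool ElifCondition) {
  if (Name == "if" || Name == "ifdef" || Name == "ifndef") {
    Stack.push_back({true, false, false});  // nested block: skip wholesale
  } else if (Name == "endif") {
    CondLevel Top = Stack.back();
    Stack.pop_back();
    return !Top.WasSkipping;                // popped the outermost skip?
  } else if (Name == "else") {
    CondLevel &Top = Stack.back();
    Top.FoundElse = true;
    if (!Top.WasSkipping && !Top.FoundNonSkip) {
      Top.FoundNonSkip = true;              // enter the #else block
      return true;
    }
  } else if (Name == "elif") {
    CondLevel &Top = Stack.back();
    if (!Top.WasSkipping && !Top.FoundNonSkip && ElifCondition) {
      Top.FoundNonSkip = true;              // condition true: enter block
      return true;
    }
  }
  return false;                             // keep skipping
}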
+ CurLexer->KeepCommentMode = KeepComments; + } + + // Finally, if we are out of the conditional (saw an #endif or ran off the end + // of the file, just stop skipping and return to lexing whatever came after + // the #if block. + CurLexer->LexingRawMode = false; +} + +/// LookupFile - Given a "foo" or <foo> reference, look up the indicated file, +/// return null on failure. isAngled indicates whether the file reference is +/// for system #include's or not (i.e. using <> instead of ""). +const FileEntry *Preprocessor::LookupFile(const char *FilenameStart, + const char *FilenameEnd, + bool isAngled, + const DirectoryLookup *FromDir, + const DirectoryLookup *&CurDir) { + // If the header lookup mechanism may be relative to the current file, pass in + // info about where the current file is. + const FileEntry *CurFileEnt = 0; + if (!FromDir) { + SourceLocation FileLoc = getCurrentFileLexer()->getFileLoc(); + CurFileEnt = SourceMgr.getFileEntryForLoc(FileLoc); + } + + // Do a standard file entry lookup. + CurDir = CurDirLookup; + const FileEntry *FE = + HeaderInfo.LookupFile(FilenameStart, FilenameEnd, + isAngled, FromDir, CurDir, CurFileEnt); + if (FE) return FE; + + // Otherwise, see if this is a subframework header. If so, this is relative + // to one of the headers on the #include stack. Walk the list of the current + // headers on the #include stack and pass them to HeaderInfo. + if (CurLexer && !CurLexer->Is_PragmaLexer) { + if ((CurFileEnt = SourceMgr.getFileEntryForLoc(CurLexer->getFileLoc()))) + if ((FE = HeaderInfo.LookupSubframeworkHeader(FilenameStart, FilenameEnd, + CurFileEnt))) + return FE; + } + + for (unsigned i = 0, e = IncludeMacroStack.size(); i != e; ++i) { + IncludeStackInfo &ISEntry = IncludeMacroStack[e-i-1]; + if (ISEntry.TheLexer && !ISEntry.TheLexer->Is_PragmaLexer) { + if ((CurFileEnt = + SourceMgr.getFileEntryForLoc(ISEntry.TheLexer->getFileLoc()))) + if ((FE = HeaderInfo.LookupSubframeworkHeader(FilenameStart, + FilenameEnd, CurFileEnt))) + return FE; + } + } + + // Otherwise, we really couldn't find the file. + return 0; +} + + +//===----------------------------------------------------------------------===// +// Preprocessor Directive Handling. +//===----------------------------------------------------------------------===// + +/// HandleDirective - This callback is invoked when the lexer sees a # token +/// at the start of a line. This consumes the directive, modifies the +/// lexer/preprocessor state, and advances the lexer(s) so that the next token +/// read is the correct one. +void Preprocessor::HandleDirective(Token &Result) { + // FIXME: Traditional: # with whitespace before it not recognized by K&R? + + // We just parsed a # character at the start of a line, so we're in directive + // mode. Tell the lexer this so any newlines we see will be converted into an + // EOM token (which terminates the directive). + CurLexer->ParsingPreprocessorDirective = true; + + ++NumDirectives; + + // We are about to read a token. For the multiple-include optimization FA to + // work, we have to remember if we had read any tokens *before* this + // pp-directive. + bool ReadAnyTokensBeforeDirective = CurLexer->MIOpt.getHasReadAnyTokensVal(); + + // Read the next token, the directive flavor. This isn't expanded due to + // C99 6.10.3p8. + LexUnexpandedToken(Result); + + // C99 6.10.3p11: Is this preprocessor directive in macro invocation? 
e.g.: + // #define A(x) #x + // A(abc + // #warning blah + // def) + // If so, the user is relying on non-portable behavior, emit a diagnostic. + if (InMacroArgs) + Diag(Result, diag::ext_embedded_directive); + +TryAgain: + switch (Result.getKind()) { + case tok::eom: + return; // null directive. + case tok::comment: + // Handle stuff like "# /*foo*/ define X" in -E -C mode. + LexUnexpandedToken(Result); + goto TryAgain; + + case tok::numeric_constant: + // FIXME: implement # 7 line numbers! + DiscardUntilEndOfDirective(); + return; + default: + IdentifierInfo *II = Result.getIdentifierInfo(); + if (II == 0) break; // Not an identifier. + + // Ask what the preprocessor keyword ID is. + switch (II->getPPKeywordID()) { + default: break; + // C99 6.10.1 - Conditional Inclusion. + case tok::pp_if: + return HandleIfDirective(Result, ReadAnyTokensBeforeDirective); + case tok::pp_ifdef: + return HandleIfdefDirective(Result, false, true/*not valid for miopt*/); + case tok::pp_ifndef: + return HandleIfdefDirective(Result, true, ReadAnyTokensBeforeDirective); + case tok::pp_elif: + return HandleElifDirective(Result); + case tok::pp_else: + return HandleElseDirective(Result); + case tok::pp_endif: + return HandleEndifDirective(Result); + + // C99 6.10.2 - Source File Inclusion. + case tok::pp_include: + return HandleIncludeDirective(Result); // Handle #include. + + // C99 6.10.3 - Macro Replacement. + case tok::pp_define: + return HandleDefineDirective(Result); + case tok::pp_undef: + return HandleUndefDirective(Result); + + // C99 6.10.4 - Line Control. + case tok::pp_line: + // FIXME: implement #line + DiscardUntilEndOfDirective(); + return; + + // C99 6.10.5 - Error Directive. + case tok::pp_error: + return HandleUserDiagnosticDirective(Result, false); + + // C99 6.10.6 - Pragma Directive. + case tok::pp_pragma: + return HandlePragmaDirective(); + + // GNU Extensions. + case tok::pp_import: + return HandleImportDirective(Result); + case tok::pp_include_next: + return HandleIncludeNextDirective(Result); + + case tok::pp_warning: + Diag(Result, diag::ext_pp_warning_directive); + return HandleUserDiagnosticDirective(Result, true); + case tok::pp_ident: + return HandleIdentSCCSDirective(Result); + case tok::pp_sccs: + return HandleIdentSCCSDirective(Result); + case tok::pp_assert: + //isExtension = true; // FIXME: implement #assert + break; + case tok::pp_unassert: + //isExtension = true; // FIXME: implement #unassert + break; + } + break; + } + + // If we reached here, the preprocessing token is not valid! + Diag(Result, diag::err_pp_invalid_directive); + + // Read the rest of the PP line. + DiscardUntilEndOfDirective(); + + // Okay, we're done parsing the directive. +} + +void Preprocessor::HandleUserDiagnosticDirective(Token &Tok, + bool isWarning) { + // Read the rest of the line raw. We do this because we don't want macros + // to be expanded and we don't require that the tokens be valid preprocessing + // tokens. For example, this is allowed: "#warning ` 'foo". GCC does + // collapse multiple consequtive white space between tokens, but this isn't + // specified by the standard. + std::string Message = CurLexer->ReadToEndOfLine(); + + unsigned DiagID = isWarning ? diag::pp_hash_warning : diag::err_pp_hash_error; + return Diag(Tok, DiagID, Message); +} + +/// HandleIdentSCCSDirective - Handle a #ident/#sccs directive. +/// +void Preprocessor::HandleIdentSCCSDirective(Token &Tok) { + // Yes, this directive is an extension. + Diag(Tok, diag::ext_pp_ident_directive); + + // Read the string argument. 
+ Token StrTok; + Lex(StrTok); + + // If the token kind isn't a string, it's a malformed directive. + if (StrTok.isNot(tok::string_literal) && + StrTok.isNot(tok::wide_string_literal)) + return Diag(StrTok, diag::err_pp_malformed_ident); + + // Verify that there is nothing after the string, other than EOM. + CheckEndOfDirective("#ident"); + + if (Callbacks) + Callbacks->Ident(Tok.getLocation(), getSpelling(StrTok)); +} + +//===----------------------------------------------------------------------===// +// Preprocessor Include Directive Handling. +//===----------------------------------------------------------------------===// + +/// GetIncludeFilenameSpelling - Turn the specified lexer token into a fully +/// checked and spelled filename, e.g. as an operand of #include. This returns +/// true if the input filename was in <>'s or false if it were in ""'s. The +/// caller is expected to provide a buffer that is large enough to hold the +/// spelling of the filename, but is also expected to handle the case when +/// this method decides to use a different buffer. +bool Preprocessor::GetIncludeFilenameSpelling(SourceLocation Loc, + const char *&BufStart, + const char *&BufEnd) { + // Get the text form of the filename. + assert(BufStart != BufEnd && "Can't have tokens with empty spellings!"); + + // Make sure the filename is <x> or "x". + bool isAngled; + if (BufStart[0] == '<') { + if (BufEnd[-1] != '>') { + Diag(Loc, diag::err_pp_expects_filename); + BufStart = 0; + return true; + } + isAngled = true; + } else if (BufStart[0] == '"') { + if (BufEnd[-1] != '"') { + Diag(Loc, diag::err_pp_expects_filename); + BufStart = 0; + return true; + } + isAngled = false; + } else { + Diag(Loc, diag::err_pp_expects_filename); + BufStart = 0; + return true; + } + + // Diagnose #include "" as invalid. + if (BufEnd-BufStart <= 2) { + Diag(Loc, diag::err_pp_empty_filename); + BufStart = 0; + return ""; + } + + // Skip the brackets. + ++BufStart; + --BufEnd; + return isAngled; +} + +/// ConcatenateIncludeName - Handle cases where the #include name is expanded +/// from a macro as multiple tokens, which need to be glued together. This +/// occurs for code like: +/// #define FOO <a/b.h> +/// #include FOO +/// because in this case, "<a/b.h>" is returned as 7 tokens, not one. +/// +/// This code concatenates and consumes tokens up to the '>' token. It returns +/// false if the > was found, otherwise it returns true if it finds and consumes +/// the EOM marker. +static bool ConcatenateIncludeName(llvm::SmallVector<char, 128> &FilenameBuffer, + Preprocessor &PP) { + Token CurTok; + + PP.Lex(CurTok); + while (CurTok.isNot(tok::eom)) { + // Append the spelling of this token to the buffer. If there was a space + // before it, add it now. + if (CurTok.hasLeadingSpace()) + FilenameBuffer.push_back(' '); + + // Get the spelling of the token, directly into FilenameBuffer if possible. + unsigned PreAppendSize = FilenameBuffer.size(); + FilenameBuffer.resize(PreAppendSize+CurTok.getLength()); + + const char *BufPtr = &FilenameBuffer[PreAppendSize]; + unsigned ActualLen = PP.getSpelling(CurTok, BufPtr); + + // If the token was spelled somewhere else, copy it into FilenameBuffer. + if (BufPtr != &FilenameBuffer[PreAppendSize]) + memcpy(&FilenameBuffer[PreAppendSize], BufPtr, ActualLen); + + // Resize FilenameBuffer to the correct size. + if (CurTok.getLength() != ActualLen) + FilenameBuffer.resize(PreAppendSize+ActualLen); + + // If we found the '>' marker, return success. 
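// [Editor's note] A minimal sketch of the <x> / "x" classification that
// GetIncludeFilenameSpelling below performs, using std::string in place of
// raw buffer pointers. A diagnostic in the real code becomes a false 'Valid'
// flag here; the return value is the isAngled result.
#include <string>

static bool ClassifyIncludeName(std::string &Name, bool &Valid) {
  Valid = Name.size() > 2 &&                 // "" and <> are diagnosed empty
          ((Name.front() == '<' && Name.back() == '>') ||
           (Name.front() == '"' && Name.back() == '"'));
  bool Angled = Valid && Name.front() == '<';
  if (Valid)
    Name = Name.substr(1, Name.size() - 2);  // strip the delimiters
  return Angled;
}
// ClassifyIncludeName on "<a/b.h>" yields true with Name == "a/b.h".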
+ if (CurTok.is(tok::greater)) + return false; + + PP.Lex(CurTok); + } + + // If we hit the eom marker, emit an error and return true so that the caller + // knows the EOM has been read. + PP.Diag(CurTok.getLocation(), diag::err_pp_expects_filename); + return true; +} + +/// HandleIncludeDirective - The "#include" tokens have just been read, read the +/// file to be included from the lexer, then include it! This is a common +/// routine with functionality shared between #include, #include_next and +/// #import. +void Preprocessor::HandleIncludeDirective(Token &IncludeTok, + const DirectoryLookup *LookupFrom, + bool isImport) { + + Token FilenameTok; + CurLexer->LexIncludeFilename(FilenameTok); + + // Reserve a buffer to get the spelling. + llvm::SmallVector<char, 128> FilenameBuffer; + const char *FilenameStart, *FilenameEnd; + + switch (FilenameTok.getKind()) { + case tok::eom: + // If the token kind is EOM, the error has already been diagnosed. + return; + + case tok::angle_string_literal: + case tok::string_literal: { + FilenameBuffer.resize(FilenameTok.getLength()); + FilenameStart = &FilenameBuffer[0]; + unsigned Len = getSpelling(FilenameTok, FilenameStart); + FilenameEnd = FilenameStart+Len; + break; + } + + case tok::less: + // This could be a <foo/bar.h> file coming from a macro expansion. In this + // case, glue the tokens together into FilenameBuffer and interpret those. + FilenameBuffer.push_back('<'); + if (ConcatenateIncludeName(FilenameBuffer, *this)) + return; // Found <eom> but no ">"? Diagnostic already emitted. + FilenameStart = &FilenameBuffer[0]; + FilenameEnd = &FilenameBuffer[FilenameBuffer.size()]; + break; + default: + Diag(FilenameTok.getLocation(), diag::err_pp_expects_filename); + DiscardUntilEndOfDirective(); + return; + } + + bool isAngled = GetIncludeFilenameSpelling(FilenameTok.getLocation(), + FilenameStart, FilenameEnd); + // If GetIncludeFilenameSpelling set the start ptr to null, there was an + // error. + if (FilenameStart == 0) { + DiscardUntilEndOfDirective(); + return; + } + + // Verify that there is nothing after the filename, other than EOM. Use the + // preprocessor to lex this in case lexing the filename entered a macro. + CheckEndOfDirective("#include"); + + // Check that we don't have infinite #include recursion. + if (IncludeMacroStack.size() == MaxAllowedIncludeStackDepth-1) + return Diag(FilenameTok, diag::err_pp_include_too_deep); + + // Search include directories. + const DirectoryLookup *CurDir; + const FileEntry *File = LookupFile(FilenameStart, FilenameEnd, + isAngled, LookupFrom, CurDir); + if (File == 0) + return Diag(FilenameTok, diag::err_pp_file_not_found, + std::string(FilenameStart, FilenameEnd)); + + // Ask HeaderInfo if we should enter this #include file. + if (!HeaderInfo.ShouldEnterIncludeFile(File, isImport)) { + // If it returns true, #including this file will have no effect. + return; + } + + // Look up the file, create a File ID for it. + unsigned FileID = SourceMgr.createFileID(File, FilenameTok.getLocation()); + if (FileID == 0) + return Diag(FilenameTok, diag::err_pp_file_not_found, + std::string(FilenameStart, FilenameEnd)); + + // Finally, if all is good, enter the new file! + EnterSourceFile(FileID, CurDir); +} + +/// HandleIncludeNextDirective - Implements #include_next. +/// +void Preprocessor::HandleIncludeNextDirective(Token &IncludeNextTok) { + Diag(IncludeNextTok, diag::ext_pp_include_next_directive); + + // #include_next is like #include, except that we start searching after + // the current found directory. 
If we can't do this, issue a + // diagnostic. + const DirectoryLookup *Lookup = CurDirLookup; + if (isInPrimaryFile()) { + Lookup = 0; + Diag(IncludeNextTok, diag::pp_include_next_in_primary); + } else if (Lookup == 0) { + Diag(IncludeNextTok, diag::pp_include_next_absolute_path); + } else { + // Start looking up in the next directory. + ++Lookup; + } + + return HandleIncludeDirective(IncludeNextTok, Lookup); +} + +/// HandleImportDirective - Implements #import. +/// +void Preprocessor::HandleImportDirective(Token &ImportTok) { + Diag(ImportTok, diag::ext_pp_import_directive); + + return HandleIncludeDirective(ImportTok, 0, true); +} + +//===----------------------------------------------------------------------===// +// Preprocessor Macro Directive Handling. +//===----------------------------------------------------------------------===// + +/// ReadMacroDefinitionArgList - The ( starting an argument list of a macro +/// definition has just been read. Lex the rest of the arguments and the +/// closing ), updating MI with what we learn. Return true if an error occurs +/// parsing the arg list. +bool Preprocessor::ReadMacroDefinitionArgList(MacroInfo *MI) { + llvm::SmallVector<IdentifierInfo*, 32> Arguments; + + Token Tok; + while (1) { + LexUnexpandedToken(Tok); + switch (Tok.getKind()) { + case tok::r_paren: + // Found the end of the argument list. + if (Arguments.empty()) { // #define FOO() + MI->setArgumentList(Arguments.begin(), Arguments.end()); + return false; + } + // Otherwise we have #define FOO(A,) + Diag(Tok, diag::err_pp_expected_ident_in_arg_list); + return true; + case tok::ellipsis: // #define X(... -> C99 varargs + // Warn if use of C99 feature in non-C99 mode. + if (!Features.C99) Diag(Tok, diag::ext_variadic_macro); + + // Lex the token after the identifier. + LexUnexpandedToken(Tok); + if (Tok.isNot(tok::r_paren)) { + Diag(Tok, diag::err_pp_missing_rparen_in_macro_def); + return true; + } + // Add the __VA_ARGS__ identifier as an argument. + Arguments.push_back(Ident__VA_ARGS__); + MI->setIsC99Varargs(); + MI->setArgumentList(Arguments.begin(), Arguments.end()); + return false; + case tok::eom: // #define X( + Diag(Tok, diag::err_pp_missing_rparen_in_macro_def); + return true; + default: + // Handle keywords and identifiers here to accept things like + // #define Foo(for) for. + IdentifierInfo *II = Tok.getIdentifierInfo(); + if (II == 0) { + // #define X(1 + Diag(Tok, diag::err_pp_invalid_tok_in_arg_list); + return true; + } + + // If this is already used as an argument, it is used multiple times (e.g. + // #define X(A,A. + if (std::find(Arguments.begin(), Arguments.end(), II) != + Arguments.end()) { // C99 6.10.3p6 + Diag(Tok, diag::err_pp_duplicate_name_in_arg_list, II->getName()); + return true; + } + + // Add the argument to the macro info. + Arguments.push_back(II); + + // Lex the token after the identifier. + LexUnexpandedToken(Tok); + + switch (Tok.getKind()) { + default: // #define X(A B + Diag(Tok, diag::err_pp_expected_comma_in_arg_list); + return true; + case tok::r_paren: // #define X(A) + MI->setArgumentList(Arguments.begin(), Arguments.end()); + return false; + case tok::comma: // #define X(A, + break; + case tok::ellipsis: // #define X(A... -> GCC extension + // Diagnose extension. + Diag(Tok, diag::ext_named_variadic_macro); + + // Lex the token after the identifier. 
+ LexUnexpandedToken(Tok); + if (Tok.isNot(tok::r_paren)) { + Diag(Tok, diag::err_pp_missing_rparen_in_macro_def); + return true; + } + + MI->setIsGNUVarargs(); + MI->setArgumentList(Arguments.begin(), Arguments.end()); + return false; + } + } + } +} + +/// HandleDefineDirective - Implements #define. This consumes the entire macro +/// line then lets the caller lex the next real token. +void Preprocessor::HandleDefineDirective(Token &DefineTok) { + ++NumDefined; + + Token MacroNameTok; + ReadMacroName(MacroNameTok, 1); + + // Error reading macro name? If so, diagnostic already issued. + if (MacroNameTok.is(tok::eom)) + return; + + // If we are supposed to keep comments in #defines, reenable comment saving + // mode. + CurLexer->KeepCommentMode = KeepMacroComments; + + // Create the new macro. + MacroInfo *MI = new MacroInfo(MacroNameTok.getLocation()); + + Token Tok; + LexUnexpandedToken(Tok); + + // If this is a function-like macro definition, parse the argument list, + // marking each of the identifiers as being used as macro arguments. Also, + // check other constraints on the first token of the macro body. + if (Tok.is(tok::eom)) { + // If there is no body to this macro, we have no special handling here. + } else if (Tok.is(tok::l_paren) && !Tok.hasLeadingSpace()) { + // This is a function-like macro definition. Read the argument list. + MI->setIsFunctionLike(); + if (ReadMacroDefinitionArgList(MI)) { + // Forget about MI. + delete MI; + // Throw away the rest of the line. + if (CurLexer->ParsingPreprocessorDirective) + DiscardUntilEndOfDirective(); + return; + } + + // Read the first token after the arg list for down below. + LexUnexpandedToken(Tok); + } else if (!Tok.hasLeadingSpace()) { + // C99 requires whitespace between the macro definition and the body. Emit + // a diagnostic for something like "#define X+". + if (Features.C99) { + Diag(Tok, diag::ext_c99_whitespace_required_after_macro_name); + } else { + // FIXME: C90/C++ do not get this diagnostic, but it does get a similar + // one in some cases! + } + } else { + // This is a normal token with leading space. Clear the leading space + // marker on the first token to get proper expansion. + Tok.clearFlag(Token::LeadingSpace); + } + + // If this is a definition of a variadic C99 function-like macro, not using + // the GNU named varargs extension, enabled __VA_ARGS__. + + // "Poison" __VA_ARGS__, which can only appear in the expansion of a macro. + // This gets unpoisoned where it is allowed. + assert(Ident__VA_ARGS__->isPoisoned() && "__VA_ARGS__ should be poisoned!"); + if (MI->isC99Varargs()) + Ident__VA_ARGS__->setIsPoisoned(false); + + // Read the rest of the macro body. + if (MI->isObjectLike()) { + // Object-like macros are very simple, just read their body. + while (Tok.isNot(tok::eom)) { + MI->AddTokenToBody(Tok); + // Get the next token of the macro. + LexUnexpandedToken(Tok); + } + + } else { + // Otherwise, read the body of a function-like macro. This has to validate + // the # (stringize) operator. + while (Tok.isNot(tok::eom)) { + MI->AddTokenToBody(Tok); + + // Check C99 6.10.3.2p1: ensure that # operators are followed by macro + // parameters in function-like macro expansions. + if (Tok.isNot(tok::hash)) { + // Get the next token of the macro. + LexUnexpandedToken(Tok); + continue; + } + + // Get the next token of the macro. + LexUnexpandedToken(Tok); + + // Not a macro arg identifier? 
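// [Editor's note] Hypothetical definitions exercising the paths of
// ReadMacroDefinitionArgList and of the '#' validation above:
//
//   #define F()              // empty argument list
//   #define G(a, b) a b      // identifiers separated by commas
//   #define H(...)           // C99 variadic: __VA_ARGS__ is appended
//   #define I(a...)          // GNU named-variadic extension (diagnosed)
//   #define J(a, a) a        // error: duplicate parameter, C99 6.10.3p6
//   #define K(a,             <- error: EOM before the closing ')'
//
// And for the C99 6.10.3.2p1 constraint that '#' be followed by a macro
// parameter in a function-like macro:
//
//   #define STR(x) #x        // ok: 'x' is a parameter
//   #define BAD(x) #y        // error: 'y' is not a parameter
//   #define OBJ #x           // ok: '#' is only special in function-like macros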
+ if (!Tok.getIdentifierInfo() || + MI->getArgumentNum(Tok.getIdentifierInfo()) == -1) { + Diag(Tok, diag::err_pp_stringize_not_parameter); + delete MI; + + // Disable __VA_ARGS__ again. + Ident__VA_ARGS__->setIsPoisoned(true); + return; + } + + // Things look ok, add the param name token to the macro. + MI->AddTokenToBody(Tok); + + // Get the next token of the macro. + LexUnexpandedToken(Tok); + } + } + + + // Disable __VA_ARGS__ again. + Ident__VA_ARGS__->setIsPoisoned(true); + + // Check that there is no paste (##) operator at the begining or end of the + // replacement list. + unsigned NumTokens = MI->getNumTokens(); + if (NumTokens != 0) { + if (MI->getReplacementToken(0).is(tok::hashhash)) { + Diag(MI->getReplacementToken(0), diag::err_paste_at_start); + delete MI; + return; + } + if (MI->getReplacementToken(NumTokens-1).is(tok::hashhash)) { + Diag(MI->getReplacementToken(NumTokens-1), diag::err_paste_at_end); + delete MI; + return; + } + } + + // If this is the primary source file, remember that this macro hasn't been + // used yet. + if (isInPrimaryFile()) + MI->setIsUsed(false); + + // Finally, if this identifier already had a macro defined for it, verify that + // the macro bodies are identical and free the old definition. + if (MacroInfo *OtherMI = getMacroInfo(MacroNameTok.getIdentifierInfo())) { + if (!OtherMI->isUsed()) + Diag(OtherMI->getDefinitionLoc(), diag::pp_macro_not_used); + + // Macros must be identical. This means all tokes and whitespace separation + // must be the same. C99 6.10.3.2. + if (!MI->isIdenticalTo(*OtherMI, *this)) { + Diag(MI->getDefinitionLoc(), diag::ext_pp_macro_redef, + MacroNameTok.getIdentifierInfo()->getName()); + Diag(OtherMI->getDefinitionLoc(), diag::ext_pp_macro_redef2); + } + delete OtherMI; + } + + setMacroInfo(MacroNameTok.getIdentifierInfo(), MI); +} + +/// HandleUndefDirective - Implements #undef. +/// +void Preprocessor::HandleUndefDirective(Token &UndefTok) { + ++NumUndefined; + + Token MacroNameTok; + ReadMacroName(MacroNameTok, 2); + + // Error reading macro name? If so, diagnostic already issued. + if (MacroNameTok.is(tok::eom)) + return; + + // Check to see if this is the last token on the #undef line. + CheckEndOfDirective("#undef"); + + // Okay, we finally have a valid identifier to undef. + MacroInfo *MI = getMacroInfo(MacroNameTok.getIdentifierInfo()); + + // If the macro is not defined, this is a noop undef, just return. + if (MI == 0) return; + + if (!MI->isUsed()) + Diag(MI->getDefinitionLoc(), diag::pp_macro_not_used); + + // Free macro definition. + delete MI; + setMacroInfo(MacroNameTok.getIdentifierInfo(), 0); +} + + +//===----------------------------------------------------------------------===// +// Preprocessor Conditional Directive Handling. +//===----------------------------------------------------------------------===// + +/// HandleIfdefDirective - Implements the #ifdef/#ifndef directive. isIfndef is +/// true when this is a #ifndef directive. ReadAnyTokensBeforeDirective is true +/// if any tokens have been returned or pp-directives activated before this +/// #ifndef has been lexed. +/// +void Preprocessor::HandleIfdefDirective(Token &Result, bool isIfndef, + bool ReadAnyTokensBeforeDirective) { + ++NumIf; + Token DirectiveTok = Result; + + Token MacroNameTok; + ReadMacroName(MacroNameTok); + + // Error reading macro name? If so, diagnostic already issued. + if (MacroNameTok.is(tok::eom)) { + // Skip code until we get to #endif. This helps with recovery by not + // emitting an error when the #endif is reached. 
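// [Editor's note] Hypothetical replacement lists illustrating the '##'
// placement constraint checked below (C99 6.10.3.3p1): a paste operator may
// not begin or end a replacement list.
//
//   #define CAT(a, b) a ## b   // ok: '##' strictly between tokens
//   #define BAD1(a) ## a       // err_paste_at_start
//   #define BAD2(a) a ##       // err_paste_at_end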
+ SkipExcludedConditionalBlock(DirectiveTok.getLocation(), + /*Foundnonskip*/false, /*FoundElse*/false); + return; + } + + // Check to see if this is the last token on the #if[n]def line. + CheckEndOfDirective(isIfndef ? "#ifndef" : "#ifdef"); + + if (CurLexer->getConditionalStackDepth() == 0) { + // If the start of a top-level #ifdef, inform MIOpt. + if (!ReadAnyTokensBeforeDirective) { + assert(isIfndef && "#ifdef shouldn't reach here"); + CurLexer->MIOpt.EnterTopLevelIFNDEF(MacroNameTok.getIdentifierInfo()); + } else + CurLexer->MIOpt.EnterTopLevelConditional(); + } + + IdentifierInfo *MII = MacroNameTok.getIdentifierInfo(); + MacroInfo *MI = getMacroInfo(MII); + + // If there is a macro, process it. + if (MI) // Mark it used. + MI->setIsUsed(true); + + // Should we include the stuff contained by this directive? + if (!MI == isIfndef) { + // Yes, remember that we are inside a conditional, then lex the next token. + CurLexer->pushConditionalLevel(DirectiveTok.getLocation(), /*wasskip*/false, + /*foundnonskip*/true, /*foundelse*/false); + } else { + // No, skip the contents of this block and return the first token after it. + SkipExcludedConditionalBlock(DirectiveTok.getLocation(), + /*Foundnonskip*/false, + /*FoundElse*/false); + } +} + +/// HandleIfDirective - Implements the #if directive. +/// +void Preprocessor::HandleIfDirective(Token &IfToken, + bool ReadAnyTokensBeforeDirective) { + ++NumIf; + + // Parse and evaluation the conditional expression. + IdentifierInfo *IfNDefMacro = 0; + bool ConditionalTrue = EvaluateDirectiveExpression(IfNDefMacro); + + // Should we include the stuff contained by this directive? + if (ConditionalTrue) { + // If this condition is equivalent to #ifndef X, and if this is the first + // directive seen, handle it for the multiple-include optimization. + if (CurLexer->getConditionalStackDepth() == 0) { + if (!ReadAnyTokensBeforeDirective && IfNDefMacro) + CurLexer->MIOpt.EnterTopLevelIFNDEF(IfNDefMacro); + else + CurLexer->MIOpt.EnterTopLevelConditional(); + } + + // Yes, remember that we are inside a conditional, then lex the next token. + CurLexer->pushConditionalLevel(IfToken.getLocation(), /*wasskip*/false, + /*foundnonskip*/true, /*foundelse*/false); + } else { + // No, skip the contents of this block and return the first token after it. + SkipExcludedConditionalBlock(IfToken.getLocation(), /*Foundnonskip*/false, + /*FoundElse*/false); + } +} + +/// HandleEndifDirective - Implements the #endif directive. +/// +void Preprocessor::HandleEndifDirective(Token &EndifToken) { + ++NumEndif; + + // Check that this is the whole directive. + CheckEndOfDirective("#endif"); + + PPConditionalInfo CondInfo; + if (CurLexer->popConditionalLevel(CondInfo)) { + // No conditionals on the stack: this is an #endif without an #if. + return Diag(EndifToken, diag::err_pp_endif_without_if); + } + + // If this the end of a top-level #endif, inform MIOpt. + if (CurLexer->getConditionalStackDepth() == 0) + CurLexer->MIOpt.ExitTopLevelConditional(); + + assert(!CondInfo.WasSkipping && !CurLexer->LexingRawMode && + "This code should only be reachable in the non-skipping case!"); +} + + +void Preprocessor::HandleElseDirective(Token &Result) { + ++NumElse; + + // #else directive in a non-skipping conditional... start skipping. + CheckEndOfDirective("#else"); + + PPConditionalInfo CI; + if (CurLexer->popConditionalLevel(CI)) + return Diag(Result, diag::pp_err_else_without_if); + + // If this is a top-level #else, inform the MIOpt. 
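// [Editor's note] The MIOpt calls below feed the multiple-include
// optimization: when a file's only top-level construct is a single
// '#ifndef GUARD ... #endif', re-includes while GUARD is defined can be
// skipped without re-lexing the file. The qualifying shape:
//
//   #ifndef FOO_H        // first directive, no tokens before it:
//   #define FOO_H        //   EnterTopLevelIFNDEF(FOO_H)
//   /* file body */
//   #endif               // depth returns to 0: ExitTopLevelConditional()
//
// Any token before the #ifndef, or a different first directive, routes
// through EnterTopLevelConditional() instead and defeats the optimization.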
+ if (CurLexer->getConditionalStackDepth() == 0) + CurLexer->MIOpt.EnterTopLevelConditional(); + + // If this is a #else with a #else before it, report the error. + if (CI.FoundElse) Diag(Result, diag::pp_err_else_after_else); + + // Finally, skip the rest of the contents of this block and return the first + // token after it. + return SkipExcludedConditionalBlock(CI.IfLoc, /*Foundnonskip*/true, + /*FoundElse*/true); +} + +void Preprocessor::HandleElifDirective(Token &ElifToken) { + ++NumElse; + + // #elif directive in a non-skipping conditional... start skipping. + // We don't care what the condition is, because we will always skip it (since + // the block immediately before it was included). + DiscardUntilEndOfDirective(); + + PPConditionalInfo CI; + if (CurLexer->popConditionalLevel(CI)) + return Diag(ElifToken, diag::pp_err_elif_without_if); + + // If this is a top-level #elif, inform the MIOpt. + if (CurLexer->getConditionalStackDepth() == 0) + CurLexer->MIOpt.EnterTopLevelConditional(); + + // If this is a #elif with a #else before it, report the error. + if (CI.FoundElse) Diag(ElifToken, diag::pp_err_elif_after_else); + + // Finally, skip the rest of the contents of this block and return the first + // token after it. + return SkipExcludedConditionalBlock(CI.IfLoc, /*Foundnonskip*/true, + /*FoundElse*/CI.FoundElse); +} + diff --git a/clang/lib/Lex/PPExpressions.cpp b/clang/lib/Lex/PPExpressions.cpp new file mode 100644 index 00000000000..cca76289176 --- /dev/null +++ b/clang/lib/Lex/PPExpressions.cpp @@ -0,0 +1,639 @@ +//===--- PPExpressions.cpp - Preprocessor Expression Evaluation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Preprocessor::EvaluateDirectiveExpression method, +// which parses and evaluates integer constant expressions for #if directives. +// +//===----------------------------------------------------------------------===// +// +// FIXME: implement testing for #assert's. +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/MacroInfo.h" +#include "clang/Lex/LiteralSupport.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/Basic/TokenKinds.h" +#include "clang/Basic/Diagnostic.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/SmallString.h" +using namespace clang; + +static bool EvaluateDirectiveSubExpr(llvm::APSInt &LHS, unsigned MinPrec, + Token &PeekTok, bool ValueLive, + Preprocessor &PP); + +/// DefinedTracker - This struct is used while parsing expressions to keep track +/// of whether !defined(X) has been seen. +/// +/// With this simple scheme, we handle the basic forms: +/// !defined(X) and !defined X +/// but we also trivially handle (silly) stuff like: +/// !!!defined(X) and +!defined(X) and !+!+!defined(X) and !(defined(X)). +struct DefinedTracker { + /// Each time a Value is evaluated, it returns information about whether the + /// parsed value is of the form defined(X), !defined(X) or is something else. + enum TrackerState { + DefinedMacro, // defined(X) + NotDefinedMacro, // !defined(X) + Unknown // Something else. + } State; + /// TheMacro - When the state is DefinedMacro or NotDefinedMacro, this + /// indicates the macro that was checked. 
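// [Editor's note] Hypothetical #if conditions showing how the tracker below
// classifies expressions, matching the forms listed at its definition:
//
//   #if defined(X)         // DefinedMacro, TheMacro == X
//   #if defined X          // DefinedMacro (parentheses are optional)
//   #if !defined(X)        // NotDefinedMacro: '!' flips the state
//   #if !!!defined(X)      // NotDefinedMacro: each '!' flips it again
//   #if (defined(X))       // DefinedMacro: parens preserve the state
//   #if defined(X) && Y    // effectively Unknown to the caller: once a
//                          // binary operator appears, the expression no
//                          // longer qualifies for the #ifndef optimization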
+ IdentifierInfo *TheMacro; +}; + + + +/// EvaluateValue - Evaluate the token PeekTok (and any others needed) and +/// return the computed value in Result. Return true if there was an error +/// parsing. This function also returns information about the form of the +/// expression in DT. See above for information on what DT means. +/// +/// If ValueLive is false, then this value is being evaluated in a context where +/// the result is not used. As such, avoid diagnostics that relate to +/// evaluation. +static bool EvaluateValue(llvm::APSInt &Result, Token &PeekTok, + DefinedTracker &DT, bool ValueLive, + Preprocessor &PP) { + Result = 0; + DT.State = DefinedTracker::Unknown; + + // If this token's spelling is a pp-identifier, check to see if it is + // 'defined' or if it is a macro. Note that we check here because many + // keywords are pp-identifiers, so we can't check the kind. + if (IdentifierInfo *II = PeekTok.getIdentifierInfo()) { + // If this identifier isn't 'defined' and it wasn't macro expanded, it turns + // into a simple 0, unless it is the C++ keyword "true", in which case it + // turns into "1". + if (II->getPPKeywordID() != tok::pp_defined) { + PP.Diag(PeekTok, diag::warn_pp_undef_identifier, II->getName()); + Result = II->getTokenID() == tok::kw_true; + Result.setIsUnsigned(false); // "0" is signed intmax_t 0. + PP.LexNonComment(PeekTok); + return false; + } + + // Handle "defined X" and "defined(X)". + + // Get the next token, don't expand it. + PP.LexUnexpandedToken(PeekTok); + + // Two options, it can either be a pp-identifier or a (. + bool InParens = false; + if (PeekTok.is(tok::l_paren)) { + // Found a paren, remember we saw it and skip it. + InParens = true; + PP.LexUnexpandedToken(PeekTok); + } + + // If we don't have a pp-identifier now, this is an error. + if ((II = PeekTok.getIdentifierInfo()) == 0) { + PP.Diag(PeekTok, diag::err_pp_defined_requires_identifier); + return true; + } + + // Otherwise, we got an identifier, is it defined to something? + Result = II->hasMacroDefinition(); + Result.setIsUnsigned(false); // Result is signed intmax_t. + + // If there is a macro, mark it used. + if (Result != 0 && ValueLive) { + MacroInfo *Macro = PP.getMacroInfo(II); + Macro->setIsUsed(true); + } + + // Consume identifier. + PP.LexNonComment(PeekTok); + + // If we are in parens, ensure we have a trailing ). + if (InParens) { + if (PeekTok.isNot(tok::r_paren)) { + PP.Diag(PeekTok, diag::err_pp_missing_rparen); + return true; + } + // Consume the ). + PP.LexNonComment(PeekTok); + } + + // Success, remember that we saw defined(X). + DT.State = DefinedTracker::DefinedMacro; + DT.TheMacro = II; + return false; + } + + switch (PeekTok.getKind()) { + default: // Non-value token. + PP.Diag(PeekTok, diag::err_pp_expr_bad_token); + return true; + case tok::eom: + case tok::r_paren: + // If there is no expression, report and exit. + PP.Diag(PeekTok, diag::err_pp_expected_value_in_expr); + return true; + case tok::numeric_constant: { + llvm::SmallString<64> IntegerBuffer; + IntegerBuffer.resize(PeekTok.getLength()); + const char *ThisTokBegin = &IntegerBuffer[0]; + unsigned ActualLength = PP.getSpelling(PeekTok, ThisTokBegin); + NumericLiteralParser Literal(ThisTokBegin, ThisTokBegin+ActualLength, + PeekTok.getLocation(), PP); + if (Literal.hadError) + return true; // a diagnostic was already reported. 
+ + if (Literal.isFloatingLiteral() || Literal.isImaginary) { + PP.Diag(PeekTok, diag::err_pp_illegal_floating_literal); + return true; + } + assert(Literal.isIntegerLiteral() && "Unknown ppnumber"); + + // long long is a C99 feature. + if (!PP.getLangOptions().C99 && !PP.getLangOptions().CPlusPlus0x + && Literal.isLongLong) + PP.Diag(PeekTok, diag::ext_longlong); + + // Parse the integer literal into Result. + if (Literal.GetIntegerValue(Result)) { + // Overflow parsing integer literal. + if (ValueLive) PP.Diag(PeekTok, diag::warn_integer_too_large); + Result.setIsUnsigned(true); + } else { + // Set the signedness of the result to match whether there was a U suffix + // or not. + Result.setIsUnsigned(Literal.isUnsigned); + + // Detect overflow based on whether the value is signed. If signed + // and if the value is too large, emit a warning "integer constant is so + // large that it is unsigned" e.g. on 12345678901234567890 where intmax_t + // is 64-bits. + if (!Literal.isUnsigned && Result.isNegative()) { + if (ValueLive)PP.Diag(PeekTok, diag::warn_integer_too_large_for_signed); + Result.setIsUnsigned(true); + } + } + + // Consume the token. + PP.LexNonComment(PeekTok); + return false; + } + case tok::char_constant: { // 'x' + llvm::SmallString<32> CharBuffer; + CharBuffer.resize(PeekTok.getLength()); + const char *ThisTokBegin = &CharBuffer[0]; + unsigned ActualLength = PP.getSpelling(PeekTok, ThisTokBegin); + CharLiteralParser Literal(ThisTokBegin, ThisTokBegin+ActualLength, + PeekTok.getLocation(), PP); + if (Literal.hadError()) + return true; // A diagnostic was already emitted. + + // Character literals are always int or wchar_t, expand to intmax_t. + TargetInfo &TI = PP.getTargetInfo(); + unsigned NumBits = TI.getCharWidth(Literal.isWide()); + + // Set the width. + llvm::APSInt Val(NumBits); + // Set the value. + Val = Literal.getValue(); + // Set the signedness. + Val.setIsUnsigned(!TI.isCharSigned()); + + if (Result.getBitWidth() > Val.getBitWidth()) { + Result = Val.extend(Result.getBitWidth()); + } else { + assert(Result.getBitWidth() == Val.getBitWidth() && + "intmax_t smaller than char/wchar_t?"); + Result = Val; + } + + // Consume the token. + PP.LexNonComment(PeekTok); + return false; + } + case tok::l_paren: + PP.LexNonComment(PeekTok); // Eat the (. + // Parse the value and if there are any binary operators involved, parse + // them. + if (EvaluateValue(Result, PeekTok, DT, ValueLive, PP)) return true; + + // If this is a silly value like (X), which doesn't need parens, check for + // !(defined X). + if (PeekTok.is(tok::r_paren)) { + // Just use DT unmodified as our result. + } else { + if (EvaluateDirectiveSubExpr(Result, 1, PeekTok, ValueLive, PP)) + return true; + + if (PeekTok.isNot(tok::r_paren)) { + PP.Diag(PeekTok, diag::err_pp_expected_rparen); + return true; + } + DT.State = DefinedTracker::Unknown; + } + PP.LexNonComment(PeekTok); // Eat the ). + return false; + + case tok::plus: + // Unary plus doesn't modify the value. + PP.LexNonComment(PeekTok); + return EvaluateValue(Result, PeekTok, DT, ValueLive, PP); + case tok::minus: { + SourceLocation Loc = PeekTok.getLocation(); + PP.LexNonComment(PeekTok); + if (EvaluateValue(Result, PeekTok, DT, ValueLive, PP)) return true; + // C99 6.5.3.3p3: The sign of the result matches the sign of the operand. + Result = -Result; + + bool Overflow = false; + if (Result.isUnsigned()) + Overflow = Result.isNegative(); + else if (Result.isMinSignedValue()) + Overflow = true; // -MININT is the only thing that overflows. 
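// [Editor's note] The unary-minus overflow test above, restated standalone
// on a fixed 64-bit width (APSInt tracks an arbitrary width plus a
// signedness flag). The check runs after Result = -Result:
#include <cstdint>
#include <limits>

static bool NegationOverflowed(int64_t Negated, bool IsUnsigned) {
  if (IsUnsigned)                  // unsigned: flagged if the sign bit is set
    return Negated < 0;
  // Signed: -MININT is the only negation that overflows, and in two's
  // complement it wraps back to itself, so the result is still MININT.
  return Negated == std::numeric_limits<int64_t>::min();
}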
+ + // If this operator is live and overflowed, report the issue. + if (Overflow && ValueLive) + PP.Diag(Loc, diag::warn_pp_expr_overflow); + + DT.State = DefinedTracker::Unknown; + return false; + } + + case tok::tilde: + PP.LexNonComment(PeekTok); + if (EvaluateValue(Result, PeekTok, DT, ValueLive, PP)) return true; + // C99 6.5.3.3p4: The sign of the result matches the sign of the operand. + Result = ~Result; + DT.State = DefinedTracker::Unknown; + return false; + + case tok::exclaim: + PP.LexNonComment(PeekTok); + if (EvaluateValue(Result, PeekTok, DT, ValueLive, PP)) return true; + Result = !Result; + // C99 6.5.3.3p5: The sign of the result is 'int', aka it is signed. + Result.setIsUnsigned(false); + + if (DT.State == DefinedTracker::DefinedMacro) + DT.State = DefinedTracker::NotDefinedMacro; + else if (DT.State == DefinedTracker::NotDefinedMacro) + DT.State = DefinedTracker::DefinedMacro; + return false; + + // FIXME: Handle #assert + } +} + + + +/// getPrecedence - Return the precedence of the specified binary operator +/// token. This returns: +/// ~0 - Invalid token. +/// 14 - *,/,% +/// 13 - -,+ +/// 12 - <<,>> +/// 11 - >=, <=, >, < +/// 10 - ==, != +/// 9 - & +/// 8 - ^ +/// 7 - | +/// 6 - && +/// 5 - || +/// 4 - ? +/// 3 - : +/// 0 - eom, ) +static unsigned getPrecedence(tok::TokenKind Kind) { + switch (Kind) { + default: return ~0U; + case tok::percent: + case tok::slash: + case tok::star: return 14; + case tok::plus: + case tok::minus: return 13; + case tok::lessless: + case tok::greatergreater: return 12; + case tok::lessequal: + case tok::less: + case tok::greaterequal: + case tok::greater: return 11; + case tok::exclaimequal: + case tok::equalequal: return 10; + case tok::amp: return 9; + case tok::caret: return 8; + case tok::pipe: return 7; + case tok::ampamp: return 6; + case tok::pipepipe: return 5; + case tok::question: return 4; + case tok::colon: return 3; + case tok::comma: return 2; + case tok::r_paren: return 0; // Lowest priority, end of expr. + case tok::eom: return 0; // Lowest priority, end of macro. + } +} + + +/// EvaluateDirectiveSubExpr - Evaluate the subexpression whose first token is +/// PeekTok, and whose precedence is PeekPrec. +/// +/// If ValueLive is false, then this value is being evaluated in a context where +/// the result is not used. As such, avoid diagnostics that relate to +/// evaluation. +static bool EvaluateDirectiveSubExpr(llvm::APSInt &LHS, unsigned MinPrec, + Token &PeekTok, bool ValueLive, + Preprocessor &PP) { + unsigned PeekPrec = getPrecedence(PeekTok.getKind()); + // If this token isn't valid, report the error. + if (PeekPrec == ~0U) { + PP.Diag(PeekTok, diag::err_pp_expr_bad_token); + return true; + } + + while (1) { + // If this token has a lower precedence than we are allowed to parse, return + // it so that higher levels of the recursion can parse it. + if (PeekPrec < MinPrec) + return false; + + tok::TokenKind Operator = PeekTok.getKind(); + + // If this is a short-circuiting operator, see if the RHS of the operator is + // dead. Note that this cannot just clobber ValueLive. Consider + // "0 && 1 ? 4 : 1 / 0", which is parsed as "(0 && 1) ? 4 : (1 / 0)". In + // this example, the RHS of the && being dead does not make the rest of the + // expr dead. + bool RHSIsLive; + if (Operator == tok::ampamp && LHS == 0) + RHSIsLive = false; // RHS of "0 && x" is dead. + else if (Operator == tok::pipepipe && LHS != 0) + RHSIsLive = false; // RHS of "1 || x" is dead. 
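// [Editor's note] EvaluateDirectiveSubExpr below is a precedence-climbing
// parser. Here is a self-contained sketch of the same control structure,
// reduced to longs and four left-associative operators; the token stream is
// literal (op literal)* ended by a '$' sentinel. Illustrative only.
#include <cassert>
#include <cstddef>
#include <vector>

struct ETok { char Op; long Val; };    // Op == 0 marks a literal

static unsigned Prec(char Op) {
  switch (Op) {
  case '*': case '/': return 14;
  case '+': case '-': return 13;
  default:            return 0;        // '$' sentinel: end of expression
  }
}

static long ParseExpr(const std::vector<ETok> &T, size_t &I,
                      long LHS, unsigned MinPrec) {
  while (Prec(T[I].Op) >= MinPrec && Prec(T[I].Op) != 0) {
    char Op = T[I++].Op;               // consume the operator
    long RHS = T[I++].Val;             // and its right operand
    // If the next operator binds tighter than Op, fold it into RHS first,
    // mirroring the ThisPrec < PeekPrec recursion in the real routine.
    while (Prec(T[I].Op) > Prec(Op))
      RHS = ParseExpr(T, I, RHS, Prec(Op) + 1);
    switch (Op) {
    case '+': LHS += RHS; break;
    case '-': LHS -= RHS; break;
    case '*': LHS *= RHS; break;
    case '/': assert(RHS != 0 && "division by zero"); LHS /= RHS; break;
    }
  }
  return LHS;
}
// For 2 + 3 * 4 (start with LHS == 2, I past the first literal): '+' is
// consumed, RHS becomes 3, the inner loop recurses on '*' (prec 14 > 13),
// RHS folds to 12, and the result is 14.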
+ else if (Operator == tok::question && LHS == 0) + RHSIsLive = false; // RHS (x) of "0 ? x : y" is dead. + else + RHSIsLive = ValueLive; + + // Consume the operator, saving the operator token for error reporting. + Token OpToken = PeekTok; + PP.LexNonComment(PeekTok); + + llvm::APSInt RHS(LHS.getBitWidth()); + // Parse the RHS of the operator. + DefinedTracker DT; + if (EvaluateValue(RHS, PeekTok, DT, RHSIsLive, PP)) return true; + + // Remember the precedence of this operator and get the precedence of the + // operator immediately to the right of the RHS. + unsigned ThisPrec = PeekPrec; + PeekPrec = getPrecedence(PeekTok.getKind()); + + // If this token isn't valid, report the error. + if (PeekPrec == ~0U) { + PP.Diag(PeekTok, diag::err_pp_expr_bad_token); + return true; + } + + bool isRightAssoc = Operator == tok::question; + + // Get the precedence of the operator to the right of the RHS. If it binds + // more tightly with RHS than we do, evaluate it completely first. + if (ThisPrec < PeekPrec || + (ThisPrec == PeekPrec && isRightAssoc)) { + if (EvaluateDirectiveSubExpr(RHS, ThisPrec+1, PeekTok, RHSIsLive, PP)) + return true; + PeekPrec = getPrecedence(PeekTok.getKind()); + } + assert(PeekPrec <= ThisPrec && "Recursion didn't work!"); + + // Usual arithmetic conversions (C99 6.3.1.8p1): result is unsigned if + // either operand is unsigned. Don't do this for x and y in "x ? y : z". + llvm::APSInt Res(LHS.getBitWidth()); + if (Operator != tok::question) { + Res.setIsUnsigned(LHS.isUnsigned()|RHS.isUnsigned()); + // If this just promoted something from signed to unsigned, and if the + // value was negative, warn about it. + if (ValueLive && Res.isUnsigned()) { + if (!LHS.isUnsigned() && LHS.isNegative()) + PP.Diag(OpToken, diag::warn_pp_convert_lhs_to_positive, + LHS.toStringSigned() + " to " + LHS.toStringUnsigned()); + if (!RHS.isUnsigned() && RHS.isNegative()) + PP.Diag(OpToken, diag::warn_pp_convert_rhs_to_positive, + RHS.toStringSigned() + " to " + RHS.toStringUnsigned()); + } + LHS.setIsUnsigned(Res.isUnsigned()); + RHS.setIsUnsigned(Res.isUnsigned()); + } + + // FIXME: All of these should detect and report overflow?? + bool Overflow = false; + switch (Operator) { + default: assert(0 && "Unknown operator token!"); + case tok::percent: + if (RHS == 0) { + if (ValueLive) PP.Diag(OpToken, diag::err_pp_remainder_by_zero); + return true; + } + Res = LHS % RHS; + break; + case tok::slash: + if (RHS == 0) { + if (ValueLive) PP.Diag(OpToken, diag::err_pp_division_by_zero); + return true; + } + Res = LHS / RHS; + if (LHS.isSigned()) + Overflow = LHS.isMinSignedValue() && RHS.isAllOnesValue(); // MININT/-1 + break; + case tok::star: + Res = LHS * RHS; + if (LHS != 0 && RHS != 0) + Overflow = Res/RHS != LHS || Res/LHS != RHS; + break; + case tok::lessless: { + // Determine whether overflow is about to happen. + unsigned ShAmt = static_cast<unsigned>(RHS.getLimitedValue()); + if (ShAmt >= LHS.getBitWidth()) + Overflow = true, ShAmt = LHS.getBitWidth()-1; + else if (LHS.isUnsigned()) + Overflow = ShAmt > LHS.countLeadingZeros(); + else if (LHS.isNonNegative()) + Overflow = ShAmt >= LHS.countLeadingZeros(); // Don't allow sign change. + else + Overflow = ShAmt >= LHS.countLeadingOnes(); + + Res = LHS << ShAmt; + break; + } + case tok::greatergreater: { + // Determine whether overflow is about to happen. 
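// [Editor's note] The '<<' overflow test above, restated standalone with
// compiler builtins in place of APInt::countLeadingZeros/Ones (the
// __builtin_clzll calls assume a GCC/Clang-style compiler). A shift
// overflows if it would drop a set bit or, for signed values, change the
// sign bit.
#include <cstdint>

static bool ShlOverflows(int64_t LHS, unsigned ShAmt, bool IsUnsigned) {
  if (ShAmt >= 64)
    return true;                          // the entire value is shifted out
  uint64_t U = static_cast<uint64_t>(LHS);
  unsigned LeadZeros = (U == 0)  ? 64 : __builtin_clzll(U);
  unsigned LeadOnes  = (~U == 0) ? 64 : __builtin_clzll(~U);
  if (IsUnsigned)
    return ShAmt > LeadZeros;             // a set bit gets shifted out
  if (LHS >= 0)
    return ShAmt >= LeadZeros;            // also keep the 0 sign bit intact
  return ShAmt >= LeadOnes;               // keep the run of 1 sign bits
}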
+ unsigned ShAmt = static_cast<unsigned>(RHS.getLimitedValue()); + if (ShAmt >= LHS.getBitWidth()) + Overflow = true, ShAmt = LHS.getBitWidth()-1; + Res = LHS >> ShAmt; + break; + } + case tok::plus: + Res = LHS + RHS; + if (LHS.isUnsigned()) + Overflow = Res.ult(LHS); + else if (LHS.isNonNegative() == RHS.isNonNegative() && + Res.isNonNegative() != LHS.isNonNegative()) + Overflow = true; // Overflow for signed addition. + break; + case tok::minus: + Res = LHS - RHS; + if (LHS.isUnsigned()) + Overflow = Res.ugt(LHS); + else if (LHS.isNonNegative() != RHS.isNonNegative() && + Res.isNonNegative() != LHS.isNonNegative()) + Overflow = true; // Overflow for signed subtraction. + break; + case tok::lessequal: + Res = LHS <= RHS; + Res.setIsUnsigned(false); // C99 6.5.8p6, result is always int (signed) + break; + case tok::less: + Res = LHS < RHS; + Res.setIsUnsigned(false); // C99 6.5.8p6, result is always int (signed) + break; + case tok::greaterequal: + Res = LHS >= RHS; + Res.setIsUnsigned(false); // C99 6.5.8p6, result is always int (signed) + break; + case tok::greater: + Res = LHS > RHS; + Res.setIsUnsigned(false); // C99 6.5.8p6, result is always int (signed) + break; + case tok::exclaimequal: + Res = LHS != RHS; + Res.setIsUnsigned(false); // C99 6.5.9p3, result is always int (signed) + break; + case tok::equalequal: + Res = LHS == RHS; + Res.setIsUnsigned(false); // C99 6.5.9p3, result is always int (signed) + break; + case tok::amp: + Res = LHS & RHS; + break; + case tok::caret: + Res = LHS ^ RHS; + break; + case tok::pipe: + Res = LHS | RHS; + break; + case tok::ampamp: + Res = (LHS != 0 && RHS != 0); + Res.setIsUnsigned(false); // C99 6.5.13p3, result is always int (signed) + break; + case tok::pipepipe: + Res = (LHS != 0 || RHS != 0); + Res.setIsUnsigned(false); // C99 6.5.14p3, result is always int (signed) + break; + case tok::comma: + PP.Diag(OpToken, diag::ext_pp_comma_expr); + Res = RHS; // LHS = LHS,RHS -> RHS. + break; + case tok::question: { + // Parse the : part of the expression. + if (PeekTok.isNot(tok::colon)) { + PP.Diag(OpToken, diag::err_pp_question_without_colon); + return true; + } + // Consume the :. + PP.LexNonComment(PeekTok); + + // Evaluate the value after the :. + bool AfterColonLive = ValueLive && LHS == 0; + llvm::APSInt AfterColonVal(LHS.getBitWidth()); + DefinedTracker DT; + if (EvaluateValue(AfterColonVal, PeekTok, DT, AfterColonLive, PP)) + return true; + + // Parse anything after the : RHS that has a higher precedence than ?. + if (EvaluateDirectiveSubExpr(AfterColonVal, ThisPrec+1, + PeekTok, AfterColonLive, PP)) + return true; + + // Now that we have the condition, the LHS and the RHS of the :, evaluate. + Res = LHS != 0 ? RHS : AfterColonVal; + + // Usual arithmetic conversions (C99 6.3.1.8p1): result is unsigned if + // either operand is unsigned. + Res.setIsUnsigned(RHS.isUnsigned() | AfterColonVal.isUnsigned()); + + // Figure out the precedence of the token after the : part. + PeekPrec = getPrecedence(PeekTok.getKind()); + break; + } + case tok::colon: + // Don't allow :'s to float around without being part of ?: exprs. + PP.Diag(OpToken, diag::err_pp_colon_without_question); + return true; + } + + // If this operator is live and overflowed, report the issue. + if (Overflow && ValueLive) + PP.Diag(OpToken, diag::warn_pp_expr_overflow); + + // Put the result back into 'LHS' for our next iteration. 
+ LHS = Res; + } + + return false; +} + +/// EvaluateDirectiveExpression - Evaluate an integer constant expression that +/// may occur after a #if or #elif directive. If the expression is equivalent +/// to "!defined(X)" return X in IfNDefMacro. +bool Preprocessor:: +EvaluateDirectiveExpression(IdentifierInfo *&IfNDefMacro) { + // Peek ahead one token. + Token Tok; + Lex(Tok); + + // C99 6.10.1p3 - All expressions are evaluated as intmax_t or uintmax_t. + unsigned BitWidth = getTargetInfo().getIntMaxTWidth(); + + llvm::APSInt ResVal(BitWidth); + DefinedTracker DT; + if (EvaluateValue(ResVal, Tok, DT, true, *this)) { + // Parse error, skip the rest of the macro line. + if (Tok.isNot(tok::eom)) + DiscardUntilEndOfDirective(); + return false; + } + + // If we are at the end of the expression after just parsing a value, there + // must be no (unparenthesized) binary operators involved, so we can exit + // directly. + if (Tok.is(tok::eom)) { + // If the expression we parsed was of the form !defined(macro), return the + // macro in IfNDefMacro. + if (DT.State == DefinedTracker::NotDefinedMacro) + IfNDefMacro = DT.TheMacro; + + return ResVal != 0; + } + + // Otherwise, we must have a binary operator (e.g. "#if 1 < 2"), so parse the + // operator and the stuff after it. + if (EvaluateDirectiveSubExpr(ResVal, 1, Tok, true, *this)) { + // Parse error, skip the rest of the macro line. + if (Tok.isNot(tok::eom)) + DiscardUntilEndOfDirective(); + return false; + } + + // If we aren't at the tok::eom token, something bad happened, like an extra + // ')' token. + if (Tok.isNot(tok::eom)) { + Diag(Tok, diag::err_pp_expected_eol); + DiscardUntilEndOfDirective(); + } + + return ResVal != 0; +} + diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp new file mode 100644 index 00000000000..bd0ff7f94a1 --- /dev/null +++ b/clang/lib/Lex/PPLexerChange.cpp @@ -0,0 +1,401 @@ +//===--- PPLexerChange.cpp - Handle changing lexers in the preprocessor ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements pieces of the Preprocessor interface that manage the +// current lexer stack. +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/MacroInfo.h" +#include "clang/Lex/PPCallbacks.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/SourceManager.h" +using namespace clang; + +PPCallbacks::~PPCallbacks() { +} + + +//===----------------------------------------------------------------------===// +// Miscellaneous Methods. +//===----------------------------------------------------------------------===// + +/// isInPrimaryFile - Return true if we're in the top-level file, not in a +/// #include. This looks through macro expansions and active _Pragma lexers. +bool Preprocessor::isInPrimaryFile() const { + if (CurLexer && !CurLexer->Is_PragmaLexer) + return IncludeMacroStack.empty(); + + // If there are any stacked lexers, we're in a #include. 
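+ // (For example, while a macro is being expanded inside b.h, which was
+ // included from a.c, CurLexer is null: entry 0 is a.c's lexer and the loop
+ // below finds b.h's file lexer at a later index, so this returns false.)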
+ assert(IncludeMacroStack[0].TheLexer &&
+ !IncludeMacroStack[0].TheLexer->Is_PragmaLexer &&
+ "Top level include stack isn't our primary lexer?");
+ for (unsigned i = 1, e = IncludeMacroStack.size(); i != e; ++i)
+ if (IncludeMacroStack[i].TheLexer &&
+ !IncludeMacroStack[i].TheLexer->Is_PragmaLexer)
+ return false;
+ return true;
+}
+
+/// getCurrentFileLexer - Return the current file lexer being lexed from. Note
+/// that this ignores any potentially active macro expansions and _Pragma
+/// expansions going on at the time.
+Lexer *Preprocessor::getCurrentFileLexer() const {
+ if (CurLexer && !CurLexer->Is_PragmaLexer) return CurLexer;
+
+ // Look for a stacked lexer.
+ for (unsigned i = IncludeMacroStack.size(); i != 0; --i) {
+ Lexer *L = IncludeMacroStack[i-1].TheLexer;
+ if (L && !L->Is_PragmaLexer) // Ignore macro & _Pragma expansions.
+ return L;
+ }
+ return 0;
+}
+
+/// LookAhead - This peeks ahead N tokens and returns that token without
+/// consuming any tokens. LookAhead(0) returns 'Tok', LookAhead(1) returns
+/// the token after Tok, etc.
+///
+/// NOTE: this is a relatively expensive method, so it should not be used in
+/// common code paths if possible!
+///
+Token Preprocessor::LookAhead(unsigned N) {
+ // FIXME: Optimize the case where multiple lookahead calls are used back to
+ // back. Consider if the parser contained (dynamically):
+ // Lookahead(1); Lookahead(1); Lookahead(1)
+ // This would return the same token 3 times, but would end up making lots of
+ // token stream lexers to do it. To handle this common case, see if the top
+ // of the lexer stack is a TokenStreamLexer with macro expansion disabled. If
+ // so, see if it has 'N' tokens available in it. If so, just return the
+ // token.
+
+ // FIXME: Optimize the case when the parser does multiple nearby lookahead
+ // calls. For example, consider:
+ // Lookahead(0); Lookahead(1); Lookahead(2);
+ // The previous optimization won't apply, and there won't be any space left in
+ // the array that was previously new'd. To handle this, always round up the
+ // size we new to a multiple of 16 tokens. If the previous buffer has space
+ // left, we can just grow it. This means we only have to do the new 1/16th as
+ // often.
+
+ Token *LookaheadTokens = new Token[N+1]; // The loop below fills slots 0..N.
+
+ // Read N+1 tokens into LookaheadTokens. After this loop, Tok is the token
+ // to return.
+ Token Tok;
+ unsigned NumTokens = 0;
+ for (; N != ~0U; --N, ++NumTokens) {
+ Lex(Tok);
+ LookaheadTokens[NumTokens] = Tok;
+
+ // If we got to EOF, don't lex past it. This will cause LookAhead to return
+ // the EOF token.
+ if (Tok.is(tok::eof))
+ break;
+ }
+
+ // Okay, at this point, we have the token we want to return in Tok. However,
+ // we read it and a bunch of other stuff (in LookaheadTokens) that we must
+ // allow subsequent calls to 'Lex' to return. To do this, we push a new token
+ // lexer onto the lexer stack with the tokens we read here. This passes
+ // ownership of LookaheadTokens to EnterTokenStream.
+ //
+ // Note that we disable macro expansion of the tokens from this buffer, since
+ // any macros have already been expanded, and the internal preprocessor state
+ // may already read past new macros. Consider something like LookAhead(1) on
+ // X
+ // #define X 14
+ // Y
+ // The lookahead call should return 'Y', and the next Lex call should return
+ // 'X' even though X -> 14 has already been entered as a macro.
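+ // That is, the replayed 'X' was lexed before the #define directive was
+ // processed; re-expanding it during replay would wrongly produce '14'.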
+ //
+ EnterTokenStream(LookaheadTokens, NumTokens, true /*DisableExpansion*/,
+ true /*OwnsTokens*/);
+ return Tok;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Methods for Entering and Callbacks for leaving various contexts
+//===----------------------------------------------------------------------===//
+
+/// EnterSourceFile - Add a source file to the top of the include stack and
+/// start lexing tokens from it instead of the current buffer.
+void Preprocessor::EnterSourceFile(unsigned FileID,
+ const DirectoryLookup *CurDir) {
+ assert(CurTokenLexer == 0 && "Cannot #include a file inside a macro!");
+ ++NumEnteredSourceFiles;
+
+ if (MaxIncludeStackDepth < IncludeMacroStack.size())
+ MaxIncludeStackDepth = IncludeMacroStack.size();
+
+ Lexer *TheLexer = new Lexer(SourceLocation::getFileLoc(FileID, 0), *this);
+ EnterSourceFileWithLexer(TheLexer, CurDir);
+}
+
+/// EnterSourceFileWithLexer - Add a source file to the top of the include
+/// stack and start lexing tokens from it instead of the current buffer.
+void Preprocessor::EnterSourceFileWithLexer(Lexer *TheLexer,
+ const DirectoryLookup *CurDir) {
+
+ // Add the current lexer to the include stack.
+ if (CurLexer || CurTokenLexer)
+ IncludeMacroStack.push_back(IncludeStackInfo(CurLexer, CurDirLookup,
+ CurTokenLexer));
+
+ CurLexer = TheLexer;
+ CurDirLookup = CurDir;
+ CurTokenLexer = 0;
+
+ // Notify the client, if desired, that we are in a new source file.
+ if (Callbacks && !CurLexer->Is_PragmaLexer) {
+ DirectoryLookup::DirType FileType = DirectoryLookup::NormalHeaderDir;
+
+ // Get the file entry for the current file.
+ if (const FileEntry *FE =
+ SourceMgr.getFileEntryForLoc(CurLexer->getFileLoc()))
+ FileType = HeaderInfo.getFileDirFlavor(FE);
+
+ Callbacks->FileChanged(CurLexer->getFileLoc(),
+ PPCallbacks::EnterFile, FileType);
+ }
+}
+
+
+
+/// EnterMacro - Add a Macro to the top of the include stack and start lexing
+/// tokens from it instead of the current buffer.
+void Preprocessor::EnterMacro(Token &Tok, MacroArgs *Args) {
+ IncludeMacroStack.push_back(IncludeStackInfo(CurLexer, CurDirLookup,
+ CurTokenLexer));
+ CurLexer = 0;
+ CurDirLookup = 0;
+
+ if (NumCachedTokenLexers == 0) {
+ CurTokenLexer = new TokenLexer(Tok, Args, *this);
+ } else {
+ CurTokenLexer = TokenLexerCache[--NumCachedTokenLexers];
+ CurTokenLexer->Init(Tok, Args);
+ }
+}
+
+/// EnterTokenStream - Add a "macro" context to the top of the include stack,
+/// which will cause the lexer to start returning the specified tokens.
+///
+/// If DisableMacroExpansion is true, tokens lexed from the token stream will
+/// not be subject to further macro expansion. Otherwise, these tokens will
+/// be re-macro-expanded when/if expansion is enabled.
+///
+/// If OwnsTokens is false, this method assumes that the specified stream of
+/// tokens has a permanent owner somewhere, so they do not need to be copied.
+/// If it is true, it assumes the array of tokens is allocated with new[] and
+/// must be freed.
+///
+void Preprocessor::EnterTokenStream(const Token *Toks, unsigned NumToks,
+ bool DisableMacroExpansion,
+ bool OwnsTokens) {
+ // Save our current state.
+ IncludeMacroStack.push_back(IncludeStackInfo(CurLexer, CurDirLookup,
+ CurTokenLexer));
+ CurLexer = 0;
+ CurDirLookup = 0;
+
+ // Create a macro expander to expand from the specified token stream.
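+ // (LookAhead above is one client: it hands its heap-allocated token buffer
+ // to this method with DisableMacroExpansion and OwnsTokens both set.)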
+ if (NumCachedTokenLexers == 0) { + CurTokenLexer = new TokenLexer(Toks, NumToks, DisableMacroExpansion, + OwnsTokens, *this); + } else { + CurTokenLexer = TokenLexerCache[--NumCachedTokenLexers]; + CurTokenLexer->Init(Toks, NumToks, DisableMacroExpansion, OwnsTokens); + } +} + +/// HandleEndOfFile - This callback is invoked when the lexer hits the end of +/// the current file. This either returns the EOF token or pops a level off +/// the include stack and keeps going. +bool Preprocessor::HandleEndOfFile(Token &Result, bool isEndOfMacro) { + assert(!CurTokenLexer && + "Ending a file when currently in a macro!"); + + // See if this file had a controlling macro. + if (CurLexer) { // Not ending a macro, ignore it. + if (const IdentifierInfo *ControllingMacro = + CurLexer->MIOpt.GetControllingMacroAtEndOfFile()) { + // Okay, this has a controlling macro, remember in PerFileInfo. + if (const FileEntry *FE = + SourceMgr.getFileEntryForLoc(CurLexer->getFileLoc())) + HeaderInfo.SetFileControllingMacro(FE, ControllingMacro); + } + } + + // If this is a #include'd file, pop it off the include stack and continue + // lexing the #includer file. + if (!IncludeMacroStack.empty()) { + // We're done with the #included file. + RemoveTopOfLexerStack(); + + // Notify the client, if desired, that we are in a new source file. + if (Callbacks && !isEndOfMacro && CurLexer) { + DirectoryLookup::DirType FileType = DirectoryLookup::NormalHeaderDir; + + // Get the file entry for the current file. + if (const FileEntry *FE = + SourceMgr.getFileEntryForLoc(CurLexer->getFileLoc())) + FileType = HeaderInfo.getFileDirFlavor(FE); + + Callbacks->FileChanged(CurLexer->getSourceLocation(CurLexer->BufferPtr), + PPCallbacks::ExitFile, FileType); + } + + // Client should lex another token. + return false; + } + + // If the file ends with a newline, form the EOF token on the newline itself, + // rather than "on the line following it", which doesn't exist. This makes + // diagnostics relating to the end of file include the last file that the user + // actually typed, which is goodness. + const char *EndPos = CurLexer->BufferEnd; + if (EndPos != CurLexer->BufferStart && + (EndPos[-1] == '\n' || EndPos[-1] == '\r')) { + --EndPos; + + // Handle \n\r and \r\n: + if (EndPos != CurLexer->BufferStart && + (EndPos[-1] == '\n' || EndPos[-1] == '\r') && + EndPos[-1] != EndPos[0]) + --EndPos; + } + + Result.startToken(); + CurLexer->BufferPtr = EndPos; + CurLexer->FormTokenWithChars(Result, EndPos); + Result.setKind(tok::eof); + + // We're done with the #included file. + delete CurLexer; + CurLexer = 0; + + // This is the end of the top-level file. If the diag::pp_macro_not_used + // diagnostic is enabled, look for macros that have not been used. + if (Diags.getDiagnosticLevel(diag::pp_macro_not_used) != Diagnostic::Ignored){ + for (llvm::DenseMap<IdentifierInfo*, MacroInfo*>::iterator I = + Macros.begin(), E = Macros.end(); I != E; ++I) { + if (!I->second->isUsed()) + Diag(I->second->getDefinitionLoc(), diag::pp_macro_not_used); + } + } + return true; +} + +/// HandleEndOfTokenLexer - This callback is invoked when the current TokenLexer +/// hits the end of its token stream. +bool Preprocessor::HandleEndOfTokenLexer(Token &Result) { + assert(CurTokenLexer && !CurLexer && + "Ending a macro when currently in a #include file!"); + + // Delete or cache the now-dead macro expander. 
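+ // (TokenLexerCache holds up to TokenLexerCacheSize retired TokenLexers;
+ // EnterMacro and EnterTokenStream re-Init() a cached one instead of
+ // allocating, so the common expansion path avoids a new/delete pair.)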
+ if (NumCachedTokenLexers == TokenLexerCacheSize) + delete CurTokenLexer; + else + TokenLexerCache[NumCachedTokenLexers++] = CurTokenLexer; + + // Handle this like a #include file being popped off the stack. + CurTokenLexer = 0; + return HandleEndOfFile(Result, true); +} + +/// RemoveTopOfLexerStack - Pop the current lexer/macro exp off the top of the +/// lexer stack. This should only be used in situations where the current +/// state of the top-of-stack lexer is unknown. +void Preprocessor::RemoveTopOfLexerStack() { + assert(!IncludeMacroStack.empty() && "Ran out of stack entries to load"); + + if (CurTokenLexer) { + // Delete or cache the now-dead macro expander. + if (NumCachedTokenLexers == TokenLexerCacheSize) + delete CurTokenLexer; + else + TokenLexerCache[NumCachedTokenLexers++] = CurTokenLexer; + } else { + delete CurLexer; + } + CurLexer = IncludeMacroStack.back().TheLexer; + CurDirLookup = IncludeMacroStack.back().TheDirLookup; + CurTokenLexer = IncludeMacroStack.back().TheTokenLexer; + IncludeMacroStack.pop_back(); +} + +/// HandleMicrosoftCommentPaste - When the macro expander pastes together a +/// comment (/##/) in microsoft mode, this method handles updating the current +/// state, returning the token on the next source line. +void Preprocessor::HandleMicrosoftCommentPaste(Token &Tok) { + assert(CurTokenLexer && !CurLexer && + "Pasted comment can only be formed from macro"); + + // We handle this by scanning for the closest real lexer, switching it to + // raw mode and preprocessor mode. This will cause it to return \n as an + // explicit EOM token. + Lexer *FoundLexer = 0; + bool LexerWasInPPMode = false; + for (unsigned i = 0, e = IncludeMacroStack.size(); i != e; ++i) { + IncludeStackInfo &ISI = *(IncludeMacroStack.end()-i-1); + if (ISI.TheLexer == 0) continue; // Scan for a real lexer. + + // Once we find a real lexer, mark it as raw mode (disabling macro + // expansions) and preprocessor mode (return EOM). We know that the lexer + // was *not* in raw mode before, because the macro that the comment came + // from was expanded. However, it could have already been in preprocessor + // mode (#if COMMENT) in which case we have to return it to that mode and + // return EOM. + FoundLexer = ISI.TheLexer; + FoundLexer->LexingRawMode = true; + LexerWasInPPMode = FoundLexer->ParsingPreprocessorDirective; + FoundLexer->ParsingPreprocessorDirective = true; + break; + } + + // Okay, we either found and switched over the lexer, or we didn't find a + // lexer. In either case, finish off the macro the comment came from, getting + // the next token. + if (!HandleEndOfTokenLexer(Tok)) Lex(Tok); + + // Discarding comments as long as we don't have EOF or EOM. This 'comments + // out' the rest of the line, including any tokens that came from other macros + // that were active, as in: + // #define submacro a COMMENT b + // submacro c + // which should lex to 'a' only: 'b' and 'c' should be removed. + while (Tok.isNot(tok::eom) && Tok.isNot(tok::eof)) + Lex(Tok); + + // If we got an eom token, then we successfully found the end of the line. + if (Tok.is(tok::eom)) { + assert(FoundLexer && "Can't get end of line without an active lexer"); + // Restore the lexer back to normal mode instead of raw mode. + FoundLexer->LexingRawMode = false; + + // If the lexer was already in preprocessor mode, just return the EOM token + // to finish the preprocessor line. + if (LexerWasInPPMode) return; + + // Otherwise, switch out of PP mode and return the next lexed token. 
+ FoundLexer->ParsingPreprocessorDirective = false;
+ return Lex(Tok);
+ }
+
+ // If we got an EOF token, then we reached the end of the token stream but
+ // didn't find an explicit \n. This can only happen if there was no lexer
+ // active (an active lexer would return EOM at EOF if there was no \n in
+ // preprocessor directive mode), so just return EOF as our token.
+ assert(!FoundLexer && "Lexer should return EOM before EOF in PP mode");
+}
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
new file mode 100644
index 00000000000..8218d0ac06e
--- /dev/null
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -0,0 +1,523 @@
+//===--- PPMacroExpansion.cpp - Top level Macro Expansion -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the top level handling of macro expansion for the
+// preprocessor.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/Preprocessor.h"
+#include "MacroArgs.h"
+#include "clang/Lex/MacroInfo.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/Diagnostic.h"
+using namespace clang;
+
+/// setMacroInfo - Specify a macro for this identifier.
+///
+void Preprocessor::setMacroInfo(IdentifierInfo *II, MacroInfo *MI) {
+ if (MI == 0) {
+ if (II->hasMacroDefinition()) {
+ Macros.erase(II);
+ II->setHasMacroDefinition(false);
+ }
+ } else {
+ Macros[II] = MI;
+ II->setHasMacroDefinition(true);
+ }
+}
+
+/// RegisterBuiltinMacro - Register the specified identifier in the identifier
+/// table and mark it as a builtin macro to be expanded.
+IdentifierInfo *Preprocessor::RegisterBuiltinMacro(const char *Name) {
+ // Get the identifier.
+ IdentifierInfo *Id = getIdentifierInfo(Name);
+
+ // Mark it as being a macro that is builtin.
+ MacroInfo *MI = new MacroInfo(SourceLocation());
+ MI->setIsBuiltinMacro();
+ setMacroInfo(Id, MI);
+ return Id;
+}
+
+
+/// RegisterBuiltinMacros - Register builtin macros, such as __LINE__ with the
+/// identifier table.
+void Preprocessor::RegisterBuiltinMacros() {
+ Ident__LINE__ = RegisterBuiltinMacro("__LINE__");
+ Ident__FILE__ = RegisterBuiltinMacro("__FILE__");
+ Ident__DATE__ = RegisterBuiltinMacro("__DATE__");
+ Ident__TIME__ = RegisterBuiltinMacro("__TIME__");
+ Ident_Pragma = RegisterBuiltinMacro("_Pragma");
+
+ // GCC Extensions.
+ Ident__BASE_FILE__ = RegisterBuiltinMacro("__BASE_FILE__");
+ Ident__INCLUDE_LEVEL__ = RegisterBuiltinMacro("__INCLUDE_LEVEL__");
+ Ident__TIMESTAMP__ = RegisterBuiltinMacro("__TIMESTAMP__");
+}
+
+/// isTrivialSingleTokenExpansion - Return true if MI, which has a single token
+/// in its expansion, currently expands to that token literally.
+static bool isTrivialSingleTokenExpansion(const MacroInfo *MI,
+ const IdentifierInfo *MacroIdent,
+ Preprocessor &PP) {
+ IdentifierInfo *II = MI->getReplacementToken(0).getIdentifierInfo();
+
+ // If the token isn't an identifier, it's always literally expanded.
+ if (II == 0) return true;
+
+ // If the identifier is a macro, and if that macro is enabled, it may be
+ // expanded so it's not a trivial expansion.
+ if (II->hasMacroDefinition() && PP.getMacroInfo(II)->isEnabled() &&
+ // Fast expanding "#define X X" is ok, because X would be disabled.
+ II != MacroIdent)
+ return false;
+
+ // If this is an object-like macro invocation, it is safe to trivially expand
+ // it.
+ if (MI->isObjectLike()) return true;
+
+ // If this is a function-like macro invocation, it's safe to trivially expand
+ // as long as the identifier is not a macro argument.
+ for (MacroInfo::arg_iterator I = MI->arg_begin(), E = MI->arg_end();
+ I != E; ++I)
+ if (*I == II)
+ return false; // Identifier is a macro argument.
+
+ return true;
+}
+
+
+/// isNextPPTokenLParen - Determine whether the next preprocessor token to be
+/// lexed is a '('. If so, consume the token and return true, if not, this
+/// method should have no observable side-effect on the lexed tokens.
+bool Preprocessor::isNextPPTokenLParen() {
+ // Do some quick tests for rejection cases.
+ unsigned Val;
+ if (CurLexer)
+ Val = CurLexer->isNextPPTokenLParen();
+ else
+ Val = CurTokenLexer->isNextTokenLParen();
+
+ if (Val == 2) {
+ // We have run off the end. If it's a source file we don't
+ // examine enclosing ones (C99 5.1.1.2p4). Otherwise walk up the
+ // macro stack.
+ if (CurLexer)
+ return false;
+ for (unsigned i = IncludeMacroStack.size(); i != 0; --i) {
+ IncludeStackInfo &Entry = IncludeMacroStack[i-1];
+ if (Entry.TheLexer)
+ Val = Entry.TheLexer->isNextPPTokenLParen();
+ else
+ Val = Entry.TheTokenLexer->isNextTokenLParen();
+
+ if (Val != 2)
+ break;
+
+ // Ran off the end of a source file?
+ if (Entry.TheLexer)
+ return false;
+ }
+ }
+
+ // Okay, if we know that the token is a '(', lex it and return. Otherwise we
+ // have found something that isn't a '(' or we found the end of the
+ // translation unit. In either case, return false.
+ if (Val != 1)
+ return false;
+
+ Token Tok;
+ LexUnexpandedToken(Tok);
+ assert(Tok.is(tok::l_paren) && "Error computing l-paren-ness?");
+ return true;
+}
+
+/// HandleMacroExpandedIdentifier - If an identifier token is read that is to be
+/// expanded as a macro, handle it and return the next token as 'Identifier'.
+bool Preprocessor::HandleMacroExpandedIdentifier(Token &Identifier,
+ MacroInfo *MI) {
+ // If this is a macro expansion in the "#if !defined(x)" line for the file,
+ // then the macro could expand to different things in other contexts, so we
+ // need to disable the optimization in this case.
+ if (CurLexer) CurLexer->MIOpt.ExpandedMacro();
+
+ // If this is a builtin macro, like __LINE__ or _Pragma, handle it specially.
+ if (MI->isBuiltinMacro()) {
+ ExpandBuiltinMacro(Identifier);
+ return false;
+ }
+
+ /// Args - If this is a function-like macro expansion, this contains,
+ /// for each macro argument, the list of tokens that were provided to the
+ /// invocation.
+ MacroArgs *Args = 0;
+
+ // If this is a function-like macro, read the arguments.
+ if (MI->isFunctionLike()) {
+ // C99 6.10.3p10: If the preprocessing token immediately after the macro
+ // name isn't a '(', this macro should not be expanded. Otherwise, consume
+ // it.
+ if (!isNextPPTokenLParen())
+ return true;
+
+ // Remember that we are now parsing the arguments to a macro invocation.
+ // Preprocessor directives used inside macro arguments are not portable, and
+ // this enables the warning.
+ InMacroArgs = true;
+ Args = ReadFunctionLikeMacroArgs(Identifier, MI);
+
+ // Finished parsing args.
+ InMacroArgs = false;
+
+ // If there was an error parsing the arguments, bail out.
+ if (Args == 0) return false;
+
+ ++NumFnMacroExpanded;
+ } else {
+ ++NumMacroExpanded;
+ }
+
+ // Notice that this macro has been used.
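+ // (This feeds the end-of-main-file check in HandleEndOfFile, which walks
+ // the macro table and emits pp_macro_not_used for any definition whose
+ // isUsed() flag was never set.)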
+ MI->setIsUsed(true);
+
+ // If we started lexing a macro, enter the macro expansion body.
+
+ // If this macro expands to no tokens, don't bother to push it onto the
+ // expansion stack, only to take it right back off.
+ if (MI->getNumTokens() == 0) {
+ // No need for arg info.
+ if (Args) Args->destroy();
+
+ // Ignore this macro use, just return the next token in the current
+ // buffer.
+ bool HadLeadingSpace = Identifier.hasLeadingSpace();
+ bool IsAtStartOfLine = Identifier.isAtStartOfLine();
+
+ Lex(Identifier);
+
+ // If the identifier isn't on some OTHER line, inherit the leading
+ // whitespace/first-on-a-line property of this token. This handles
+ // stuff like "! XX," -> "! ," and " XX," -> " ,", when XX is
+ // empty.
+ if (!Identifier.isAtStartOfLine()) {
+ if (IsAtStartOfLine) Identifier.setFlag(Token::StartOfLine);
+ if (HadLeadingSpace) Identifier.setFlag(Token::LeadingSpace);
+ }
+ ++NumFastMacroExpanded;
+ return false;
+
+ } else if (MI->getNumTokens() == 1 &&
+ isTrivialSingleTokenExpansion(MI, Identifier.getIdentifierInfo(),
+ *this)){
+ // Otherwise, if this macro expands into a single trivially-expanded
+ // token: expand it now. This handles common cases like
+ // "#define VAL 42".
+
+ // Propagate the isAtStartOfLine/hasLeadingSpace markers of the macro
+ // identifier to the expanded token.
+ bool isAtStartOfLine = Identifier.isAtStartOfLine();
+ bool hasLeadingSpace = Identifier.hasLeadingSpace();
+
+ // Remember where the token is instantiated.
+ SourceLocation InstantiateLoc = Identifier.getLocation();
+
+ // Replace the result token.
+ Identifier = MI->getReplacementToken(0);
+
+ // Restore the StartOfLine/LeadingSpace markers.
+ Identifier.setFlagValue(Token::StartOfLine , isAtStartOfLine);
+ Identifier.setFlagValue(Token::LeadingSpace, hasLeadingSpace);
+
+ // Update the token's location to include both its logical and physical
+ // locations.
+ SourceLocation Loc =
+ SourceMgr.getInstantiationLoc(Identifier.getLocation(), InstantiateLoc);
+ Identifier.setLocation(Loc);
+
+ // If this is #define X X, we must mark the result as unexpandable.
+ if (IdentifierInfo *NewII = Identifier.getIdentifierInfo())
+ if (getMacroInfo(NewII) == MI)
+ Identifier.setFlag(Token::DisableExpand);
+
+ // The resulting token cannot be macro expanded further, so we're done.
+ ++NumFastMacroExpanded;
+ return false;
+ }
+
+ // Start expanding the macro.
+ EnterMacro(Identifier, Args);
+
+ // Now that the macro is at the top of the include stack, ask the
+ // preprocessor to read the next token from it.
+ Lex(Identifier);
+ return false;
+}
+
+/// ReadFunctionLikeMacroArgs - After reading "MACRO(", this method is
+/// invoked to read all of the actual arguments specified for the macro
+/// invocation. This returns null on error.
+MacroArgs *Preprocessor::ReadFunctionLikeMacroArgs(Token &MacroName,
+ MacroInfo *MI) {
+ // The number of fixed arguments to parse.
+ unsigned NumFixedArgsLeft = MI->getNumArgs();
+ bool isVariadic = MI->isVariadic();
+
+ // Outer loop, while there are more arguments, keep reading them.
+ Token Tok;
+ Tok.setKind(tok::comma);
+ --NumFixedArgsLeft; // Start reading the first arg.
+
+ // ArgTokens - Build up a list of tokens that make up each argument. Each
+ // argument is separated by an EOF token. Use a SmallVector so we can avoid
+ // heap allocations in the common case.
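+ // (Layout example: for "#define F(x, y)" invoked as "F(a, (b, c))", the
+ // flat list is: a <eof> ( b , c ) <eof>. The comma inside the parens does
+ // not split arguments because of the paren-depth tracking below.)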
+ llvm::SmallVector<Token, 64> ArgTokens;
+
+ unsigned NumActuals = 0;
+ while (Tok.is(tok::comma)) {
+ // C99 6.10.3p11: Keep track of the number of l_parens we have seen. Note
+ // that we already consumed the first one.
+ unsigned NumParens = 0;
+
+ while (1) {
+ // Read arguments as unexpanded tokens. This avoids issues, e.g., where
+ // an argument value in a macro could expand to ',' or '(' or ')'.
+ LexUnexpandedToken(Tok);
+
+ if (Tok.is(tok::eof) || Tok.is(tok::eom)) { // "#if f(<eof>" & "#if f(\n"
+ Diag(MacroName, diag::err_unterm_macro_invoc);
+ // Do not lose the EOF/EOM. Return it to the client.
+ MacroName = Tok;
+ return 0;
+ } else if (Tok.is(tok::r_paren)) {
+ // If we found the ) token, the macro arg list is done.
+ if (NumParens-- == 0)
+ break;
+ } else if (Tok.is(tok::l_paren)) {
+ ++NumParens;
+ } else if (Tok.is(tok::comma) && NumParens == 0) {
+ // Comma ends this argument if there are more fixed arguments expected.
+ if (NumFixedArgsLeft)
+ break;
+
+ // If this is not a variadic macro, too many args were specified.
+ if (!isVariadic) {
+ // Emit the diagnostic at the macro name in case there is a missing ).
+ // Emitting it at the , could be far away from the macro name.
+ Diag(MacroName, diag::err_too_many_args_in_macro_invoc);
+ return 0;
+ }
+ // Otherwise, continue to add the tokens to this variable argument.
+ } else if (Tok.is(tok::comment) && !KeepMacroComments) {
+ // If this is a comment token in the argument list and we're just in
+ // -C mode (not -CC mode), discard the comment.
+ continue;
+ } else if (Tok.is(tok::identifier)) {
+ // Reading macro arguments can cause macros that we are currently
+ // expanding from to be popped off the expansion stack. Doing so causes
+ // them to be reenabled for expansion. Here we record whether any
+ // identifiers we lex as macro arguments correspond to disabled macros.
+ // If so, we mark the token as noexpand. This is a subtle aspect of
+ // C99 6.10.3.4p2.
+ if (MacroInfo *MI = getMacroInfo(Tok.getIdentifierInfo()))
+ if (!MI->isEnabled())
+ Tok.setFlag(Token::DisableExpand);
+ }
+
+ ArgTokens.push_back(Tok);
+ }
+
+ // Empty arguments are standard in C99 and supported as an extension in
+ // other modes.
+ if (ArgTokens.empty() && !Features.C99)
+ Diag(Tok, diag::ext_empty_fnmacro_arg);
+
+ // Add a marker EOF token to the end of the token list for this argument.
+ Token EOFTok;
+ EOFTok.startToken();
+ EOFTok.setKind(tok::eof);
+ EOFTok.setLocation(Tok.getLocation());
+ EOFTok.setLength(0);
+ ArgTokens.push_back(EOFTok);
+ ++NumActuals;
+ --NumFixedArgsLeft;
+ }
+
+ // Okay, we found the r_paren. Check to see if we parsed too few
+ // arguments.
+ unsigned MinArgsExpected = MI->getNumArgs();
+
+ // See MacroArgs instance var for description of this.
+ bool isVarargsElided = false;
+
+ if (NumActuals < MinArgsExpected) {
+ // There are several cases where too few arguments is ok, handle them now.
+ if (NumActuals+1 == MinArgsExpected && MI->isVariadic()) {
+ // Varargs where the named vararg parameter is missing: ok as extension.
+ // #define A(x, ...)
+ // A("blah")
+ Diag(Tok, diag::ext_missing_varargs_arg);
+
+ // Remember this occurred if this is a C99 macro invocation with at least
+ // one actual argument.
+ isVarargsElided = MI->isC99Varargs() && MI->getNumArgs() > 1;
+ } else if (MI->getNumArgs() == 1) {
+ // #define A(x)
+ // A()
+ // is ok because it is an empty argument.
+
+ // Empty arguments are standard in C99 and supported as an extension in
+ // other modes.
+ if (ArgTokens.empty() && !Features.C99) + Diag(Tok, diag::ext_empty_fnmacro_arg); + } else { + // Otherwise, emit the error. + Diag(Tok, diag::err_too_few_args_in_macro_invoc); + return 0; + } + + // Add a marker EOF token to the end of the token list for this argument. + SourceLocation EndLoc = Tok.getLocation(); + Tok.startToken(); + Tok.setKind(tok::eof); + Tok.setLocation(EndLoc); + Tok.setLength(0); + ArgTokens.push_back(Tok); + } + + return MacroArgs::create(MI, &ArgTokens[0], ArgTokens.size(),isVarargsElided); +} + +/// ComputeDATE_TIME - Compute the current time, enter it into the specified +/// scratch buffer, then return DATELoc/TIMELoc locations with the position of +/// the identifier tokens inserted. +static void ComputeDATE_TIME(SourceLocation &DATELoc, SourceLocation &TIMELoc, + Preprocessor &PP) { + time_t TT = time(0); + struct tm *TM = localtime(&TT); + + static const char * const Months[] = { + "Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec" + }; + + char TmpBuffer[100]; + sprintf(TmpBuffer, "\"%s %2d %4d\"", Months[TM->tm_mon], TM->tm_mday, + TM->tm_year+1900); + DATELoc = PP.CreateString(TmpBuffer, strlen(TmpBuffer)); + + sprintf(TmpBuffer, "\"%02d:%02d:%02d\"", TM->tm_hour, TM->tm_min, TM->tm_sec); + TIMELoc = PP.CreateString(TmpBuffer, strlen(TmpBuffer)); +} + +/// ExpandBuiltinMacro - If an identifier token is read that is to be expanded +/// as a builtin macro, handle it and return the next token as 'Tok'. +void Preprocessor::ExpandBuiltinMacro(Token &Tok) { + // Figure out which token this is. + IdentifierInfo *II = Tok.getIdentifierInfo(); + assert(II && "Can't be a macro without id info!"); + + // If this is an _Pragma directive, expand it, invoke the pragma handler, then + // lex the token after it. + if (II == Ident_Pragma) + return Handle_Pragma(Tok); + + ++NumBuiltinMacroExpanded; + + char TmpBuffer[100]; + + // Set up the return result. + Tok.setIdentifierInfo(0); + Tok.clearFlag(Token::NeedsCleaning); + + if (II == Ident__LINE__) { + // __LINE__ expands to a simple numeric value. + sprintf(TmpBuffer, "%u", SourceMgr.getLogicalLineNumber(Tok.getLocation())); + unsigned Length = strlen(TmpBuffer); + Tok.setKind(tok::numeric_constant); + Tok.setLength(Length); + Tok.setLocation(CreateString(TmpBuffer, Length, Tok.getLocation())); + } else if (II == Ident__FILE__ || II == Ident__BASE_FILE__) { + SourceLocation Loc = Tok.getLocation(); + if (II == Ident__BASE_FILE__) { + Diag(Tok, diag::ext_pp_base_file); + SourceLocation NextLoc = SourceMgr.getIncludeLoc(Loc); + while (NextLoc.isValid()) { + Loc = NextLoc; + NextLoc = SourceMgr.getIncludeLoc(Loc); + } + } + + // Escape this filename. 
Turn '\' -> '\\' '"' -> '\"'
+ std::string FN = SourceMgr.getSourceName(SourceMgr.getLogicalLoc(Loc));
+ FN = '"' + Lexer::Stringify(FN) + '"';
+ Tok.setKind(tok::string_literal);
+ Tok.setLength(FN.size());
+ Tok.setLocation(CreateString(&FN[0], FN.size(), Tok.getLocation()));
+ } else if (II == Ident__DATE__) {
+ if (!DATELoc.isValid())
+ ComputeDATE_TIME(DATELoc, TIMELoc, *this);
+ Tok.setKind(tok::string_literal);
+ Tok.setLength(strlen("\"Mmm dd yyyy\""));
+ Tok.setLocation(SourceMgr.getInstantiationLoc(DATELoc, Tok.getLocation()));
+ } else if (II == Ident__TIME__) {
+ if (!TIMELoc.isValid())
+ ComputeDATE_TIME(DATELoc, TIMELoc, *this);
+ Tok.setKind(tok::string_literal);
+ Tok.setLength(strlen("\"hh:mm:ss\""));
+ Tok.setLocation(SourceMgr.getInstantiationLoc(TIMELoc, Tok.getLocation()));
+ } else if (II == Ident__INCLUDE_LEVEL__) {
+ Diag(Tok, diag::ext_pp_include_level);
+
+ // Compute the include depth of this token.
+ unsigned Depth = 0;
+ SourceLocation Loc = SourceMgr.getIncludeLoc(Tok.getLocation());
+ for (; Loc.isValid(); ++Depth)
+ Loc = SourceMgr.getIncludeLoc(Loc);
+
+ // __INCLUDE_LEVEL__ expands to a simple numeric value.
+ sprintf(TmpBuffer, "%u", Depth);
+ unsigned Length = strlen(TmpBuffer);
+ Tok.setKind(tok::numeric_constant);
+ Tok.setLength(Length);
+ Tok.setLocation(CreateString(TmpBuffer, Length, Tok.getLocation()));
+ } else if (II == Ident__TIMESTAMP__) {
+ // MSVC, ICC, GCC, VisualAge C++ extension. The generated string should be
+ // of the form "Ddd Mmm dd hh:mm:ss yyyy", which is returned by asctime.
+ Diag(Tok, diag::ext_pp_timestamp);
+
+ // Get the file that we are lexing out of. If we're currently lexing from
+ // a macro, dig into the include stack.
+ const FileEntry *CurFile = 0;
+ Lexer *TheLexer = getCurrentFileLexer();
+
+ if (TheLexer)
+ CurFile = SourceMgr.getFileEntryForLoc(TheLexer->getFileLoc());
+
+ // Format the file's last-modification time in asctime form, or use a
+ // placeholder if the file is unknown.
+ const char *Result;
+ if (CurFile) {
+ time_t TT = CurFile->getModificationTime();
+ struct tm *TM = localtime(&TT);
+ Result = asctime(TM);
+ } else {
+ Result = "??? ??? ?? ??:??:?? ????\n";
+ }
+ TmpBuffer[0] = '"';
+ strcpy(TmpBuffer+1, Result);
+ unsigned Len = strlen(TmpBuffer);
+ TmpBuffer[Len-1] = '"'; // Replace the newline with a quote.
+ Tok.setKind(tok::string_literal);
+ Tok.setLength(Len);
+ Tok.setLocation(CreateString(TmpBuffer, Len, Tok.getLocation()));
+ } else {
+ assert(0 && "Unknown identifier!");
+ }
+}
diff --git a/clang/lib/Lex/Pragma.cpp b/clang/lib/Lex/Pragma.cpp
new file mode 100644
index 00000000000..08ad1cf1d2f
--- /dev/null
+++ b/clang/lib/Lex/Pragma.cpp
@@ -0,0 +1,386 @@
+//===--- Pragma.cpp - Pragma registration and handling --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PragmaHandler/PragmaTable interfaces and implements
+// pragma related methods of the Preprocessor class.
+// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/Pragma.h" +#include "clang/Lex/PPCallbacks.h" +#include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/SourceManager.h" +#include "llvm/ADT/SmallVector.h" +using namespace clang; + +// Out-of-line destructor to provide a home for the class. +PragmaHandler::~PragmaHandler() { +} + +//===----------------------------------------------------------------------===// +// PragmaNamespace Implementation. +//===----------------------------------------------------------------------===// + + +PragmaNamespace::~PragmaNamespace() { + for (unsigned i = 0, e = Handlers.size(); i != e; ++i) + delete Handlers[i]; +} + +/// FindHandler - Check to see if there is already a handler for the +/// specified name. If not, return the handler for the null identifier if it +/// exists, otherwise return null. If IgnoreNull is true (the default) then +/// the null handler isn't returned on failure to match. +PragmaHandler *PragmaNamespace::FindHandler(const IdentifierInfo *Name, + bool IgnoreNull) const { + PragmaHandler *NullHandler = 0; + for (unsigned i = 0, e = Handlers.size(); i != e; ++i) { + if (Handlers[i]->getName() == Name) + return Handlers[i]; + + if (Handlers[i]->getName() == 0) + NullHandler = Handlers[i]; + } + return IgnoreNull ? 0 : NullHandler; +} + +void PragmaNamespace::HandlePragma(Preprocessor &PP, Token &Tok) { + // Read the 'namespace' that the directive is in, e.g. STDC. Do not macro + // expand it, the user can have a STDC #define, that should not affect this. + PP.LexUnexpandedToken(Tok); + + // Get the handler for this token. If there is no handler, ignore the pragma. + PragmaHandler *Handler = FindHandler(Tok.getIdentifierInfo(), false); + if (Handler == 0) return; + + // Otherwise, pass it down. + Handler->HandlePragma(PP, Tok); +} + +//===----------------------------------------------------------------------===// +// Preprocessor Pragma Directive Handling. +//===----------------------------------------------------------------------===// + +/// HandlePragmaDirective - The "#pragma" directive has been parsed. Lex the +/// rest of the pragma, passing it to the registered pragma handlers. +void Preprocessor::HandlePragmaDirective() { + ++NumPragma; + + // Invoke the first level of pragma handlers which reads the namespace id. + Token Tok; + PragmaHandlers->HandlePragma(*this, Tok); + + // If the pragma handler didn't read the rest of the line, consume it now. + if (CurLexer->ParsingPreprocessorDirective) + DiscardUntilEndOfDirective(); +} + +/// Handle_Pragma - Read a _Pragma directive, slice it up, process it, then +/// return the first token after the directive. The _Pragma token has just +/// been read into 'Tok'. +void Preprocessor::Handle_Pragma(Token &Tok) { + // Remember the pragma token location. + SourceLocation PragmaLoc = Tok.getLocation(); + + // Read the '('. + Lex(Tok); + if (Tok.isNot(tok::l_paren)) + return Diag(PragmaLoc, diag::err__Pragma_malformed); + + // Read the '"..."'. + Lex(Tok); + if (Tok.isNot(tok::string_literal) && Tok.isNot(tok::wide_string_literal)) + return Diag(PragmaLoc, diag::err__Pragma_malformed); + + // Remember the string. + std::string StrVal = getSpelling(Tok); + SourceLocation StrLoc = Tok.getLocation(); + + // Read the ')'. 
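+ // (Worked example: _Pragma("GCC dependency \"parse.y\"") is destringized
+ // below into the text ' GCC dependency "parse.y"' plus a newline, then
+ // re-lexed as an ordinary #pragma directive.)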
+ Lex(Tok); + if (Tok.isNot(tok::r_paren)) + return Diag(PragmaLoc, diag::err__Pragma_malformed); + + // The _Pragma is lexically sound. Destringize according to C99 6.10.9.1. + if (StrVal[0] == 'L') // Remove L prefix. + StrVal.erase(StrVal.begin()); + assert(StrVal[0] == '"' && StrVal[StrVal.size()-1] == '"' && + "Invalid string token!"); + + // Remove the front quote, replacing it with a space, so that the pragma + // contents appear to have a space before them. + StrVal[0] = ' '; + + // Replace the terminating quote with a \n\0. + StrVal[StrVal.size()-1] = '\n'; + StrVal += '\0'; + + // Remove escaped quotes and escapes. + for (unsigned i = 0, e = StrVal.size(); i != e-1; ++i) { + if (StrVal[i] == '\\' && + (StrVal[i+1] == '\\' || StrVal[i+1] == '"')) { + // \\ -> '\' and \" -> '"'. + StrVal.erase(StrVal.begin()+i); + --e; + } + } + + // Plop the string (including the newline and trailing null) into a buffer + // where we can lex it. + SourceLocation TokLoc = CreateString(&StrVal[0], StrVal.size(), StrLoc); + const char *StrData = SourceMgr.getCharacterData(TokLoc); + + // Make and enter a lexer object so that we lex and expand the tokens just + // like any others. + Lexer *TL = new Lexer(TokLoc, *this, + StrData, StrData+StrVal.size()-1 /* no null */); + + // Ensure that the lexer thinks it is inside a directive, so that end \n will + // return an EOM token. + TL->ParsingPreprocessorDirective = true; + + // This lexer really is for _Pragma. + TL->Is_PragmaLexer = true; + + EnterSourceFileWithLexer(TL, 0); + + // With everything set up, lex this as a #pragma directive. + HandlePragmaDirective(); + + // Finally, return whatever came after the pragma directive. + return Lex(Tok); +} + + + +/// HandlePragmaOnce - Handle #pragma once. OnceTok is the 'once'. +/// +void Preprocessor::HandlePragmaOnce(Token &OnceTok) { + if (isInPrimaryFile()) { + Diag(OnceTok, diag::pp_pragma_once_in_main_file); + return; + } + + // Get the current file lexer we're looking at. Ignore _Pragma 'files' etc. + SourceLocation FileLoc = getCurrentFileLexer()->getFileLoc(); + + // Mark the file as a once-only file now. + HeaderInfo.MarkFileIncludeOnce(SourceMgr.getFileEntryForLoc(FileLoc)); +} + +void Preprocessor::HandlePragmaMark() { + assert(CurLexer && "No current lexer?"); + CurLexer->ReadToEndOfLine(); +} + + +/// HandlePragmaPoison - Handle #pragma GCC poison. PoisonTok is the 'poison'. +/// +void Preprocessor::HandlePragmaPoison(Token &PoisonTok) { + Token Tok; + + while (1) { + // Read the next token to poison. While doing this, pretend that we are + // skipping while reading the identifier to poison. + // This avoids errors on code like: + // #pragma GCC poison X + // #pragma GCC poison X + if (CurLexer) CurLexer->LexingRawMode = true; + LexUnexpandedToken(Tok); + if (CurLexer) CurLexer->LexingRawMode = false; + + // If we reached the end of line, we're done. + if (Tok.is(tok::eom)) return; + + // Can only poison identifiers. + if (Tok.isNot(tok::identifier)) { + Diag(Tok, diag::err_pp_invalid_poison); + return; + } + + // Look up the identifier info for the token. We disabled identifier lookup + // by saying we're skipping contents, so we need to do this manually. + IdentifierInfo *II = LookUpIdentifierInfo(Tok); + + // Already poisoned. + if (II->isPoisoned()) continue; + + // If this is a macro identifier, emit a warning. + if (II->hasMacroDefinition()) + Diag(Tok, diag::pp_poisoning_existing_macro); + + // Finally, poison it! 
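+ // (After "#pragma GCC poison printf", any later use of 'printf' is
+ // diagnosed when identifier lookup sees the poisoned bit.)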
+ II->setIsPoisoned();
+ }
+}
+
+/// HandlePragmaSystemHeader - Implement #pragma GCC system_header. We know
+/// that the whole directive has been parsed.
+void Preprocessor::HandlePragmaSystemHeader(Token &SysHeaderTok) {
+ if (isInPrimaryFile()) {
+ Diag(SysHeaderTok, diag::pp_pragma_sysheader_in_main_file);
+ return;
+ }
+
+ // Get the current file lexer we're looking at. Ignore _Pragma 'files' etc.
+ Lexer *TheLexer = getCurrentFileLexer();
+
+ // Mark the file as a system header.
+ const FileEntry *File = SourceMgr.getFileEntryForLoc(TheLexer->getFileLoc());
+ HeaderInfo.MarkFileSystemHeader(File);
+
+ // Notify the client, if desired, that we are in a new source file.
+ if (Callbacks)
+ Callbacks->FileChanged(TheLexer->getSourceLocation(TheLexer->BufferPtr),
+ PPCallbacks::SystemHeaderPragma,
+ DirectoryLookup::SystemHeaderDir);
+}
+
+/// HandlePragmaDependency - Handle #pragma GCC dependency "foo" blah.
+///
+void Preprocessor::HandlePragmaDependency(Token &DependencyTok) {
+ Token FilenameTok;
+ CurLexer->LexIncludeFilename(FilenameTok);
+
+ // If the token kind is EOM, the error has already been diagnosed.
+ if (FilenameTok.is(tok::eom))
+ return;
+
+ // Reserve a buffer to get the spelling.
+ llvm::SmallVector<char, 128> FilenameBuffer;
+ FilenameBuffer.resize(FilenameTok.getLength());
+
+ const char *FilenameStart = &FilenameBuffer[0];
+ unsigned Len = getSpelling(FilenameTok, FilenameStart);
+ const char *FilenameEnd = FilenameStart+Len;
+ bool isAngled = GetIncludeFilenameSpelling(FilenameTok.getLocation(),
+ FilenameStart, FilenameEnd);
+ // If GetIncludeFilenameSpelling set the start ptr to null, there was an
+ // error.
+ if (FilenameStart == 0)
+ return;
+
+ // Search include directories for this file.
+ const DirectoryLookup *CurDir;
+ const FileEntry *File = LookupFile(FilenameStart, FilenameEnd,
+ isAngled, 0, CurDir);
+ if (File == 0)
+ return Diag(FilenameTok, diag::err_pp_file_not_found,
+ std::string(FilenameStart, FilenameEnd));
+
+ SourceLocation FileLoc = getCurrentFileLexer()->getFileLoc();
+ const FileEntry *CurFile = SourceMgr.getFileEntryForLoc(FileLoc);
+
+ // If this file is older than the file it depends on, emit a diagnostic.
+ if (CurFile && CurFile->getModificationTime() < File->getModificationTime()) {
+ // Lex tokens at the end of the message and include them in the message.
+ std::string Message;
+ Lex(DependencyTok);
+ while (DependencyTok.isNot(tok::eom)) {
+ Message += getSpelling(DependencyTok) + " ";
+ Lex(DependencyTok);
+ }
+
+ // Drop the trailing space, guarding against an empty message.
+ if (!Message.empty())
+ Message.erase(Message.end()-1);
+ Diag(FilenameTok, diag::pp_out_of_date_dependency, Message);
+ }
+}
+
+
+/// AddPragmaHandler - Add the specified pragma handler to the preprocessor.
+/// If 'Namespace' is non-null, then it is a token required to exist on the
+/// pragma line before the pragma string starts, e.g. "STDC" or "GCC".
+void Preprocessor::AddPragmaHandler(const char *Namespace,
+ PragmaHandler *Handler) {
+ PragmaNamespace *InsertNS = PragmaHandlers;
+
+ // If this is specified to be in a namespace, step down into it.
+ if (Namespace) {
+ IdentifierInfo *NSID = getIdentifierInfo(Namespace);
+
+ // If there is already a pragma handler with the name of this namespace,
+ // we either have an error (directive with the same name as a namespace) or
+ // we already have the namespace to insert into.
+ if (PragmaHandler *Existing = PragmaHandlers->FindHandler(NSID)) {
+ InsertNS = Existing->getIfNamespace();
+ assert(InsertNS != 0 && "Cannot have a pragma namespace and pragma"
+ " handler with the same name!");
+ } else {
+ // Otherwise, this namespace doesn't exist yet, create and insert the
+ // handler for it.
+ InsertNS = new PragmaNamespace(NSID);
+ PragmaHandlers->AddPragma(InsertNS);
+ }
+ }
+
+ // Check to make sure we don't already have a pragma for this identifier.
+ assert(!InsertNS->FindHandler(Handler->getName()) &&
+ "Pragma handler already exists for this identifier!");
+ InsertNS->AddPragma(Handler);
+}
+
+namespace {
+/// PragmaOnceHandler - "#pragma once" marks the file as atomically included.
+struct PragmaOnceHandler : public PragmaHandler {
+ PragmaOnceHandler(const IdentifierInfo *OnceID) : PragmaHandler(OnceID) {}
+ virtual void HandlePragma(Preprocessor &PP, Token &OnceTok) {
+ PP.CheckEndOfDirective("#pragma once");
+ PP.HandlePragmaOnce(OnceTok);
+ }
+};
+
+/// PragmaMarkHandler - "#pragma mark ..." is ignored by the compiler, and the
+/// rest of the line is not lexed.
+struct PragmaMarkHandler : public PragmaHandler {
+ PragmaMarkHandler(const IdentifierInfo *MarkID) : PragmaHandler(MarkID) {}
+ virtual void HandlePragma(Preprocessor &PP, Token &MarkTok) {
+ PP.HandlePragmaMark();
+ }
+};
+
+/// PragmaPoisonHandler - "#pragma poison x" marks x as not usable.
+struct PragmaPoisonHandler : public PragmaHandler {
+ PragmaPoisonHandler(const IdentifierInfo *ID) : PragmaHandler(ID) {}
+ virtual void HandlePragma(Preprocessor &PP, Token &PoisonTok) {
+ PP.HandlePragmaPoison(PoisonTok);
+ }
+};
+
+/// PragmaSystemHeaderHandler - "#pragma system_header" marks the current file
+/// as a system header, which silences warnings in it.
+struct PragmaSystemHeaderHandler : public PragmaHandler {
+ PragmaSystemHeaderHandler(const IdentifierInfo *ID) : PragmaHandler(ID) {}
+ virtual void HandlePragma(Preprocessor &PP, Token &SHToken) {
+ PP.HandlePragmaSystemHeader(SHToken);
+ PP.CheckEndOfDirective("#pragma");
+ }
+};
+struct PragmaDependencyHandler : public PragmaHandler {
+ PragmaDependencyHandler(const IdentifierInfo *ID) : PragmaHandler(ID) {}
+ virtual void HandlePragma(Preprocessor &PP, Token &DepToken) {
+ PP.HandlePragmaDependency(DepToken);
+ }
+};
+} // end anonymous namespace
+
+
+/// RegisterBuiltinPragmas - Install the standard preprocessor pragmas:
+/// #pragma GCC poison/system_header/dependency and #pragma once.
+void Preprocessor::RegisterBuiltinPragmas() {
+ AddPragmaHandler(0, new PragmaOnceHandler(getIdentifierInfo("once")));
+ AddPragmaHandler(0, new PragmaMarkHandler(getIdentifierInfo("mark")));
+ AddPragmaHandler("GCC", new PragmaPoisonHandler(getIdentifierInfo("poison")));
+ AddPragmaHandler("GCC", new PragmaSystemHeaderHandler(
+ getIdentifierInfo("system_header")));
+ AddPragmaHandler("GCC", new PragmaDependencyHandler(
+ getIdentifierInfo("dependency")));
+}
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
new file mode 100644
index 00000000000..86156a07728
--- /dev/null
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -0,0 +1,560 @@
+//===--- Preprocessor.cpp - C Language Family Preprocessor Implementation ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Preprocessor interface.
+// +//===----------------------------------------------------------------------===// +// +// Options to support: +// -H - Print the name of each header file used. +// -d[MDNI] - Dump various things. +// -fworking-directory - #line's with preprocessor's working dir. +// -fpreprocessed +// -dependency-file,-M,-MM,-MF,-MG,-MP,-MT,-MQ,-MD,-MMD +// -W* +// -w +// +// Messages to emit: +// "Multiple include guards may be useful for:\n" +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/MacroInfo.h" +#include "clang/Lex/Pragma.h" +#include "clang/Lex/ScratchBuffer.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TargetInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Streams.h" +#include <ctime> +using namespace clang; + +//===----------------------------------------------------------------------===// + +Preprocessor::Preprocessor(Diagnostic &diags, const LangOptions &opts, + TargetInfo &target, SourceManager &SM, + HeaderSearch &Headers) + : Diags(diags), Features(opts), Target(target), FileMgr(Headers.getFileMgr()), + SourceMgr(SM), HeaderInfo(Headers), Identifiers(opts), + CurLexer(0), CurDirLookup(0), CurTokenLexer(0), Callbacks(0) { + ScratchBuf = new ScratchBuffer(SourceMgr); + + // Clear stats. + NumDirectives = NumDefined = NumUndefined = NumPragma = 0; + NumIf = NumElse = NumEndif = 0; + NumEnteredSourceFiles = 0; + NumMacroExpanded = NumFnMacroExpanded = NumBuiltinMacroExpanded = 0; + NumFastMacroExpanded = NumTokenPaste = NumFastTokenPaste = 0; + MaxIncludeStackDepth = 0; + NumSkipped = 0; + + // Default to discarding comments. + KeepComments = false; + KeepMacroComments = false; + + // Macro expansion is enabled. + DisableMacroExpansion = false; + InMacroArgs = false; + NumCachedTokenLexers = 0; + + // "Poison" __VA_ARGS__, which can only appear in the expansion of a macro. + // This gets unpoisoned where it is allowed. + (Ident__VA_ARGS__ = getIdentifierInfo("__VA_ARGS__"))->setIsPoisoned(); + + Predefines = 0; + + // Initialize the pragma handlers. + PragmaHandlers = new PragmaNamespace(0); + RegisterBuiltinPragmas(); + + // Initialize builtin macros like __LINE__ and friends. + RegisterBuiltinMacros(); +} + +Preprocessor::~Preprocessor() { + // Free any active lexers. + delete CurLexer; + + while (!IncludeMacroStack.empty()) { + delete IncludeMacroStack.back().TheLexer; + delete IncludeMacroStack.back().TheTokenLexer; + IncludeMacroStack.pop_back(); + } + + // Free any macro definitions. + for (llvm::DenseMap<IdentifierInfo*, MacroInfo*>::iterator I = + Macros.begin(), E = Macros.end(); I != E; ++I) { + // Free the macro definition. + delete I->second; + I->second = 0; + I->first->setHasMacroDefinition(false); + } + + // Free any cached macro expanders. + for (unsigned i = 0, e = NumCachedTokenLexers; i != e; ++i) + delete TokenLexerCache[i]; + + // Release pragma information. + delete PragmaHandlers; + + // Delete the scratch buffer info. + delete ScratchBuf; + + delete Callbacks; +} + +/// Diag - Forwarding function for diagnostics. This emits a diagnostic at +/// the specified Token's location, translating the token's start +/// position in the current buffer into a SourcePosition object for rendering. 
+void Preprocessor::Diag(SourceLocation Loc, unsigned DiagID) {
+ Diags.Report(getFullLoc(Loc), DiagID);
+}
+
+void Preprocessor::Diag(SourceLocation Loc, unsigned DiagID,
+ const std::string &Msg) {
+ Diags.Report(getFullLoc(Loc), DiagID, &Msg, 1);
+}
+
+void Preprocessor::DumpToken(const Token &Tok, bool DumpFlags) const {
+ llvm::cerr << tok::getTokenName(Tok.getKind()) << " '"
+ << getSpelling(Tok) << "'";
+
+ if (!DumpFlags) return;
+
+ llvm::cerr << "\t";
+ if (Tok.isAtStartOfLine())
+ llvm::cerr << " [StartOfLine]";
+ if (Tok.hasLeadingSpace())
+ llvm::cerr << " [LeadingSpace]";
+ if (Tok.isExpandDisabled())
+ llvm::cerr << " [ExpandDisabled]";
+ if (Tok.needsCleaning()) {
+ const char *Start = SourceMgr.getCharacterData(Tok.getLocation());
+ llvm::cerr << " [UnClean='" << std::string(Start, Start+Tok.getLength())
+ << "']";
+ }
+
+ llvm::cerr << "\tLoc=<";
+ DumpLocation(Tok.getLocation());
+ llvm::cerr << ">";
+}
+
+void Preprocessor::DumpLocation(SourceLocation Loc) const {
+ SourceLocation LogLoc = SourceMgr.getLogicalLoc(Loc);
+ llvm::cerr << SourceMgr.getSourceName(LogLoc) << ':'
+ << SourceMgr.getLineNumber(LogLoc) << ':'
+ << SourceMgr.getColumnNumber(LogLoc);
+
+ SourceLocation PhysLoc = SourceMgr.getPhysicalLoc(Loc);
+ if (PhysLoc != LogLoc) {
+ llvm::cerr << " <PhysLoc=";
+ DumpLocation(PhysLoc);
+ llvm::cerr << ">";
+ }
+}
+
+void Preprocessor::DumpMacro(const MacroInfo &MI) const {
+ llvm::cerr << "MACRO: ";
+ for (unsigned i = 0, e = MI.getNumTokens(); i != e; ++i) {
+ DumpToken(MI.getReplacementToken(i));
+ llvm::cerr << " ";
+ }
+ llvm::cerr << "\n";
+}
+
+void Preprocessor::PrintStats() {
+ llvm::cerr << "\n*** Preprocessor Stats:\n";
+ llvm::cerr << NumDirectives << " directives found:\n";
+ llvm::cerr << " " << NumDefined << " #define.\n";
+ llvm::cerr << " " << NumUndefined << " #undef.\n";
+ llvm::cerr << " #include/#include_next/#import:\n";
+ llvm::cerr << " " << NumEnteredSourceFiles << " source files entered.\n";
+ llvm::cerr << " " << MaxIncludeStackDepth << " max include stack depth\n";
+ llvm::cerr << " " << NumIf << " #if/#ifndef/#ifdef.\n";
+ llvm::cerr << " " << NumElse << " #else/#elif.\n";
+ llvm::cerr << " " << NumEndif << " #endif.\n";
+ llvm::cerr << " " << NumPragma << " #pragma.\n";
+ llvm::cerr << NumSkipped << " #if/#ifndef/#ifdef regions skipped\n";
+
+ llvm::cerr << NumMacroExpanded << "/" << NumFnMacroExpanded << "/"
+ << NumBuiltinMacroExpanded << " obj/fn/builtin macros expanded, "
+ << NumFastMacroExpanded << " on the fast path.\n";
+ llvm::cerr << (NumFastTokenPaste+NumTokenPaste)
+ << " token paste (##) operations performed, "
+ << NumFastTokenPaste << " on the fast path.\n";
+}
+
+//===----------------------------------------------------------------------===//
+// Token Spelling
+//===----------------------------------------------------------------------===//
+
+
+/// getSpelling() - Return the 'spelling' of this token. The spelling of a
+/// token is the characters used to represent the token in the source file
+/// after trigraph expansion and escaped-newline folding. In particular, this
+/// wants to get the true, uncanonicalized, spelling of things like digraphs,
+/// UCNs, etc.
+std::string Preprocessor::getSpelling(const Token &Tok) const {
+ assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
+
+ // If this token contains nothing interesting, return it directly.
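+ // (Example: an identifier written with an escaped newline, "fo\<newline>o",
+ // lexes with NeedsCleaning set; the slow path below rebuilds "foo".)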
+ const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation()); + if (!Tok.needsCleaning()) + return std::string(TokStart, TokStart+Tok.getLength()); + + std::string Result; + Result.reserve(Tok.getLength()); + + // Otherwise, hard case, relex the characters into the string. + for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); + Ptr != End; ) { + unsigned CharSize; + Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features)); + Ptr += CharSize; + } + assert(Result.size() != unsigned(Tok.getLength()) && + "NeedsCleaning flag set on something that didn't need cleaning!"); + return Result; +} + +/// getSpelling - This method is used to get the spelling of a token into a +/// preallocated buffer, instead of as an std::string. The caller is required +/// to allocate enough space for the token, which is guaranteed to be at least +/// Tok.getLength() bytes long. The actual length of the token is returned. +/// +/// Note that this method may do two possible things: it may either fill in +/// the buffer specified with characters, or it may *change the input pointer* +/// to point to a constant buffer with the data already in it (avoiding a +/// copy). The caller is not allowed to modify the returned buffer pointer +/// if an internal buffer is returned. +unsigned Preprocessor::getSpelling(const Token &Tok, + const char *&Buffer) const { + assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); + + // If this token is an identifier, just return the string from the identifier + // table, which is very quick. + if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { + Buffer = II->getName(); + + // Return the length of the token. If the token needed cleaning, don't + // include the size of the newlines or trigraphs in it. + if (!Tok.needsCleaning()) + return Tok.getLength(); + else + return strlen(Buffer); + } + + // Otherwise, compute the start of the token in the input lexer buffer. + const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation()); + + // If this token contains nothing interesting, return it directly. + if (!Tok.needsCleaning()) { + Buffer = TokStart; + return Tok.getLength(); + } + // Otherwise, hard case, relex the characters into the string. + char *OutBuf = const_cast<char*>(Buffer); + for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); + Ptr != End; ) { + unsigned CharSize; + *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features); + Ptr += CharSize; + } + assert(unsigned(OutBuf-Buffer) != Tok.getLength() && + "NeedsCleaning flag set on something that didn't need cleaning!"); + + return OutBuf-Buffer; +} + + +/// CreateString - Plop the specified string into a scratch buffer and return a +/// location for it. If specified, the source location provides a source +/// location for the token. +SourceLocation Preprocessor:: +CreateString(const char *Buf, unsigned Len, SourceLocation SLoc) { + if (SLoc.isValid()) + return ScratchBuf->getToken(Buf, Len, SLoc); + return ScratchBuf->getToken(Buf, Len); +} + + +/// AdvanceToTokenCharacter - Given a location that specifies the start of a +/// token, return a new location that specifies a character within the token. +SourceLocation Preprocessor::AdvanceToTokenCharacter(SourceLocation TokStart, + unsigned CharNo) { + // If they request the first char of the token, we're trivially done. If this + // is a macro expansion, it doesn't make sense to point to a character within + // the instantiation point (the name). 
We could point to the source
+  // character, but without also pointing to instantiation info, this is
+  // confusing.
+  if (CharNo == 0 || TokStart.isMacroID()) return TokStart;
+
+  // Figure out how many physical characters away the specified logical
+  // character is.  This needs to take into consideration newlines and
+  // trigraphs.
+  const char *TokPtr = SourceMgr.getCharacterData(TokStart);
+  unsigned PhysOffset = 0;
+
+  // The usual case is that tokens don't contain anything interesting.  Skip
+  // over the uninteresting characters.  If a token only consists of simple
+  // chars, this method is extremely fast.
+  while (CharNo && Lexer::isObviouslySimpleCharacter(*TokPtr))
+    ++TokPtr, --CharNo, ++PhysOffset;
+
+  // If we have a character that may be a trigraph or escaped newline, create a
+  // lexer to parse it correctly.
+  if (CharNo != 0) {
+    // Create a lexer starting at this token position.
+    Lexer TheLexer(TokStart, *this, TokPtr);
+    Token Tok;
+    // Skip over the remaining characters.
+    const char *TokStartPtr = TokPtr;
+    for (; CharNo; --CharNo)
+      TheLexer.getAndAdvanceChar(TokPtr, Tok);
+
+    PhysOffset += TokPtr-TokStartPtr;
+  }
+
+  return TokStart.getFileLocWithOffset(PhysOffset);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Preprocessor Initialization Methods
+//===----------------------------------------------------------------------===//
+
+// Append a #define line to Buf for Macro.  Macro should be of the form "XXX",
+// in which case we emit "#define XXX 1", or "XXX=Y z W", in which case we
+// emit "#define XXX Y z W".  To get a #define with no value, use "XXX=".
+static void DefineBuiltinMacro(std::vector<char> &Buf, const char *Macro,
+                               const char *Command = "#define ") {
+  Buf.insert(Buf.end(), Command, Command+strlen(Command));
+  if (const char *Equal = strchr(Macro, '=')) {
+    // Turn the = into ' '.
+    Buf.insert(Buf.end(), Macro, Equal);
+    Buf.push_back(' ');
+    Buf.insert(Buf.end(), Equal+1, Equal+strlen(Equal));
+  } else {
+    // Push "macroname 1".
+    Buf.insert(Buf.end(), Macro, Macro+strlen(Macro));
+    Buf.push_back(' ');
+    Buf.push_back('1');
+  }
+  Buf.push_back('\n');
+}
+
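A standalone sketch of the text DefineBuiltinMacro emits; Define here is a hypothetical test harness mirroring the logic above, not part of the patch:

#include <cstdio>
#include <cstring>
#include <vector>

// Mirrors DefineBuiltinMacro: "XXX" -> "#define XXX 1\n",
// "XXX=Y z W" -> "#define XXX Y z W\n", "XXX=" -> "#define XXX \n".
static void Define(std::vector<char> &Buf, const char *Macro) {
  const char *Cmd = "#define ";
  Buf.insert(Buf.end(), Cmd, Cmd + strlen(Cmd));
  if (const char *Eq = strchr(Macro, '=')) {
    Buf.insert(Buf.end(), Macro, Eq);      // macro name
    Buf.push_back(' ');
    Buf.insert(Buf.end(), Eq + 1, Eq + strlen(Eq));  // value text
  } else {
    Buf.insert(Buf.end(), Macro, Macro + strlen(Macro));
    Buf.push_back(' ');
    Buf.push_back('1');                    // default value is 1
  }
  Buf.push_back('\n');
}

int main() {
  std::vector<char> Buf;
  Define(Buf, "__STDC__");                 // #define __STDC__ 1
  Define(Buf, "__STDC_VERSION__=199901L"); // #define __STDC_VERSION__ 199901L
  fwrite(&Buf[0], 1, Buf.size(), stdout);
}
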
+static void InitializePredefinedMacros(Preprocessor &PP,
+                                       std::vector<char> &Buf) {
+  // FIXME: Implement magic like cpp_init_builtins for things like __STDC__
+  // and __DATE__ etc.
+#if 0
+  /* __STDC__ has the value 1 under normal circumstances.
+     However, if (a) we are in a system header, (b) the option
+     stdc_0_in_system_headers is true (set by target config), and
+     (c) we are not in strictly conforming mode, then it has the
+     value 0.  (b) and (c) are already checked in cpp_init_builtins.  */
+  //case BT_STDC:
+    if (cpp_in_system_header (pfile))
+      number = 0;
+    else
+      number = 1;
+    break;
+#endif
+  // These should all be defined in the preprocessor according to the
+  // current language configuration.
+  DefineBuiltinMacro(Buf, "__STDC__=1");
+  //DefineBuiltinMacro(Buf, "__ASSEMBLER__=1");
+  if (PP.getLangOptions().C99 && !PP.getLangOptions().CPlusPlus)
+    DefineBuiltinMacro(Buf, "__STDC_VERSION__=199901L");
+  else if (0) // STDC94 ?
+    DefineBuiltinMacro(Buf, "__STDC_VERSION__=199409L");
+
+  DefineBuiltinMacro(Buf, "__STDC_HOSTED__=1");
+  if (PP.getLangOptions().ObjC1)
+    DefineBuiltinMacro(Buf, "__OBJC__=1");
+  if (PP.getLangOptions().ObjC2)
+    DefineBuiltinMacro(Buf, "__OBJC2__=1");
+
+  // Add the __builtin_va_list typedef.
+  {
+    const char *VAList = PP.getTargetInfo().getVAListDeclaration();
+    Buf.insert(Buf.end(), VAList, VAList+strlen(VAList));
+    Buf.push_back('\n');
+  }
+
+  // Get the target #defines.
+  PP.getTargetInfo().getTargetDefines(Buf);
+
+  // Compiler-set macros.
+  DefineBuiltinMacro(Buf, "__APPLE_CC__=5250");
+  DefineBuiltinMacro(Buf, "__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__=1050");
+  DefineBuiltinMacro(Buf, "__GNUC_MINOR__=0");
+  DefineBuiltinMacro(Buf, "__GNUC_PATCHLEVEL__=1");
+  DefineBuiltinMacro(Buf, "__GNUC__=4");
+  DefineBuiltinMacro(Buf, "__GXX_ABI_VERSION=1002");
+  DefineBuiltinMacro(Buf, "__VERSION__=\"4.0.1 (Apple Computer, Inc. "
+                     "build 5250)\"");
+
+  // Build configuration options.
+  DefineBuiltinMacro(Buf, "__DYNAMIC__=1");
+  DefineBuiltinMacro(Buf, "__FINITE_MATH_ONLY__=0");
+  DefineBuiltinMacro(Buf, "__NO_INLINE__=1");
+  DefineBuiltinMacro(Buf, "__PIC__=1");
+
+  if (PP.getLangOptions().CPlusPlus) {
+    DefineBuiltinMacro(Buf, "__DEPRECATED=1");
+    DefineBuiltinMacro(Buf, "__EXCEPTIONS=1");
+    DefineBuiltinMacro(Buf, "__GNUG__=4");
+    DefineBuiltinMacro(Buf, "__GXX_WEAK__=1");
+    DefineBuiltinMacro(Buf, "__cplusplus=1");
+    DefineBuiltinMacro(Buf, "__private_extern__=extern");
+  }
+  if (PP.getLangOptions().Microsoft) {
+    DefineBuiltinMacro(Buf, "__stdcall=");
+    DefineBuiltinMacro(Buf, "__cdecl=");
+    DefineBuiltinMacro(Buf, "_cdecl=");
+    DefineBuiltinMacro(Buf, "__ptr64=");
+    DefineBuiltinMacro(Buf, "__w64=");
+    DefineBuiltinMacro(Buf, "__forceinline=");
+    DefineBuiltinMacro(Buf, "__int8=char");
+    DefineBuiltinMacro(Buf, "__int16=short");
+    DefineBuiltinMacro(Buf, "__int32=int");
+    DefineBuiltinMacro(Buf, "__int64=long long");
+    DefineBuiltinMacro(Buf, "__declspec(X)=");
+  }
+  // FIXME: Should emit a #line directive here.
+}
+
+
+/// EnterMainSourceFile - Enter the main source file into the preprocessor,
+/// which implicitly adds the builtin defines and such.
+void Preprocessor::EnterMainSourceFile() {
+  unsigned MainFileID = SourceMgr.getMainFileID();
+
+  // Enter the main file source buffer.
+  EnterSourceFile(MainFileID, 0);
+
+  // Tell the header info that the main file was entered.  If the file is
+  // later #imported, it won't be re-entered.
+  if (const FileEntry *FE =
+        SourceMgr.getFileEntryForLoc(SourceLocation::getFileLoc(MainFileID, 0)))
+    HeaderInfo.IncrementIncludeCount(FE);
+
+  std::vector<char> PrologFile;
+  PrologFile.reserve(4080);
+
+  // Install things like __POWERPC__, __GNUC__, etc into the macro table.
+  InitializePredefinedMacros(*this, PrologFile);
+
+  // Add on the predefines from the driver.
+  PrologFile.insert(PrologFile.end(), Predefines, Predefines+strlen(Predefines));
+
+  // Memory buffer must end with a null byte!
+  PrologFile.push_back(0);
+
+  // Now that we have emitted the predefined macros, #includes, etc into
+  // PrologFile, preprocess it to populate the initial preprocessor state.
+  llvm::MemoryBuffer *SB =
+    llvm::MemoryBuffer::getMemBufferCopy(&PrologFile.front(), &PrologFile.back(),
+                                         "<predefines>");
+  assert(SB && "Cannot fail to create predefined source buffer");
+  unsigned FileID = SourceMgr.createFileIDForMemBuffer(SB);
+  assert(FileID && "Could not create FileID for predefines?");
+
+  // Start parsing the predefines.
+  EnterSourceFile(FileID, 0);
+}
+
+
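For concreteness, the generated "<predefines>" buffer is ordinary preprocessor text. On a C99 target it would begin roughly as follows (abridged and illustrative; the va_list line comes from getVAListDeclaration and is target-specific):

#define __STDC__ 1
#define __STDC_VERSION__ 199901L
#define __STDC_HOSTED__ 1
typedef char *__builtin_va_list;    /* target-dependent; assumption here */
#define __GNUC__ 4
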
+//===----------------------------------------------------------------------===//
+// Lexer Event Handling.
+//===----------------------------------------------------------------------===//
+
+/// LookUpIdentifierInfo - Given a tok::identifier token, look up the
+/// identifier information for the token and install it into the token.
+IdentifierInfo *Preprocessor::LookUpIdentifierInfo(Token &Identifier,
+                                                   const char *BufPtr) {
+  assert(Identifier.is(tok::identifier) && "Not an identifier!");
+  assert(Identifier.getIdentifierInfo() == 0 &&
+         "IdentifierInfo already exists!");
+
+  // Look up this token, see if it is a macro, or if it is a language keyword.
+  IdentifierInfo *II;
+  if (BufPtr && !Identifier.needsCleaning()) {
+    // No cleaning needed, just use the characters from the lexed buffer.
+    II = getIdentifierInfo(BufPtr, BufPtr+Identifier.getLength());
+  } else {
+    // Cleaning needed, allocate a buffer, clean into it, then use the buffer.
+    llvm::SmallVector<char, 64> IdentifierBuffer;
+    IdentifierBuffer.resize(Identifier.getLength());
+    const char *TmpBuf = &IdentifierBuffer[0];
+    unsigned Size = getSpelling(Identifier, TmpBuf);
+    II = getIdentifierInfo(TmpBuf, TmpBuf+Size);
+  }
+  Identifier.setIdentifierInfo(II);
+  return II;
+}
+
+
+/// HandleIdentifier - This callback is invoked when the lexer reads an
+/// identifier.  This callback looks up the identifier in the map and/or
+/// potentially macro expands it or turns it into a named token (like 'for').
+void Preprocessor::HandleIdentifier(Token &Identifier) {
+  assert(Identifier.getIdentifierInfo() &&
+         "Can't handle identifiers without identifier info!");
+
+  IdentifierInfo &II = *Identifier.getIdentifierInfo();
+
+  // If this identifier was poisoned, and if it was not produced from a macro
+  // expansion, emit an error.
+  if (II.isPoisoned() && CurLexer) {
+    if (&II != Ident__VA_ARGS__)   // We warn about __VA_ARGS__ with poisoning.
+      Diag(Identifier, diag::err_pp_used_poisoned_id);
+    else
+      Diag(Identifier, diag::ext_pp_bad_vaargs_use);
+  }
+
+  // If this is a macro to be expanded, do it.
+  if (MacroInfo *MI = getMacroInfo(&II)) {
+    if (!DisableMacroExpansion && !Identifier.isExpandDisabled()) {
+      if (MI->isEnabled()) {
+        if (!HandleMacroExpandedIdentifier(Identifier, MI))
+          return;
+      } else {
+        // C99 6.10.3.4p2 says that a disabled macro may never again be
+        // expanded, even if it's in a context where it could be expanded in
+        // the future.
+        Identifier.setFlag(Token::DisableExpand);
+      }
+    }
+  }
+
+  // C++ 2.11p2: If this is an alternative representation of a C++ operator,
+  // then we act as if it is the actual operator and not the textual
+  // representation of it.
+  if (II.isCPlusPlusOperatorKeyword())
+    Identifier.setIdentifierInfo(0);
+
+  // Change the kind of this identifier to the appropriate token kind, e.g.
+  // turning "for" into a keyword.
+  Identifier.setKind(II.getTokenID());
+
+  // If this is an extension token, diagnose its use.
+  // FIXME: Tried (unsuccessfully) to shut this up when compiling with gnu99.
+  // For now, I'm just commenting it out (while I work on attributes).
+  if (II.isExtensionToken() && Features.C99)
+    Diag(Identifier, diag::ext_token_used);
+}
+
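The C++ 2.11p2 handling in HandleIdentifier is observable at the source level: alternative tokens behave as the operators they name. An illustrative snippet, not part of the patch:

#include <cstdio>

int main() {
  bool a = true, b = false;
  // 'and' and 'not' are alternative representations of && and !.
  // HandleIdentifier clears their IdentifierInfo so later phases treat
  // them as the operators themselves, not as identifiers.
  if (a and not b)
    std::printf("alternative operator tokens work\n");
  return 0;
}
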
diff --git a/clang/lib/Lex/ScratchBuffer.cpp b/clang/lib/Lex/ScratchBuffer.cpp
new file mode 100644
index 00000000000..99fbdf75654
--- /dev/null
+++ b/clang/lib/Lex/ScratchBuffer.cpp
@@ -0,0 +1,72 @@
+//===--- ScratchBuffer.cpp - Scratch space for forming tokens -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License.  See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ScratchBuffer interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/ScratchBuffer.h"
+#include "clang/Basic/SourceManager.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cstring>
+using namespace clang;
+
+// ScratchBufSize - The size of each chunk of scratch memory.  Slightly less
+// than a page, almost certainly enough for anything. :)
+static const unsigned ScratchBufSize = 4060;
+
+ScratchBuffer::ScratchBuffer(SourceManager &SM) : SourceMgr(SM), CurBuffer(0) {
+  // Set BytesUsed so that the first call to getToken will require an alloc.
+  BytesUsed = ScratchBufSize;
+  FileID = 0;
+}
+
+/// getToken - Splat the specified text into a temporary MemoryBuffer and
+/// return a SourceLocation that refers to the token.  This is just like the
+/// method below, but returns a location that indicates the physloc of the
+/// token.
+SourceLocation ScratchBuffer::getToken(const char *Buf, unsigned Len) {
+  if (BytesUsed+Len > ScratchBufSize)
+    AllocScratchBuffer(Len);
+
+  // Copy the token data into the buffer.
+  memcpy(CurBuffer+BytesUsed, Buf, Len);
+
+  // Remember that we used these bytes.
+  BytesUsed += Len;
+
+  assert(BytesUsed-Len < (1 << SourceLocation::FilePosBits) &&
+         "Out of range file position!");
+
+  return SourceLocation::getFileLoc(FileID, BytesUsed-Len);
+}
+
+
+/// getToken - Splat the specified text into a temporary MemoryBuffer and
+/// return a SourceLocation that refers to the token.  The SourceLoc value
+/// gives a virtual location that the token will appear to be from.
+SourceLocation ScratchBuffer::getToken(const char *Buf, unsigned Len,
+                                       SourceLocation SourceLoc) {
+  // Map the physloc to the specified sourceloc.
+  return SourceMgr.getInstantiationLoc(getToken(Buf, Len), SourceLoc);
+}
+
+void ScratchBuffer::AllocScratchBuffer(unsigned RequestLen) {
+  // Only pay attention to the requested length if it is larger than our
+  // default page size.  If it is, we allocate an entire chunk for it.  This
+  // is to support gigantic tokens, which almost certainly won't happen. :)
+  if (RequestLen < ScratchBufSize)
+    RequestLen = ScratchBufSize;
+
+  llvm::MemoryBuffer *Buf =
+    llvm::MemoryBuffer::getNewMemBuffer(RequestLen, "<scratch space>");
+  FileID = SourceMgr.createFileIDForMemBuffer(Buf);
+  CurBuffer = const_cast<char*>(Buf->getBufferStart());
+  BytesUsed = 0;
+}
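ScratchBuffer is essentially a bump allocator over page-sized chunks. A freestanding sketch of the same discipline, with hypothetical names and no SourceManager bookkeeping:

#include <cstring>

class BumpScratch {
  static const unsigned ChunkSize = 4060;  // slightly under a page
  char *Cur;
  unsigned Used;
public:
  BumpScratch() : Cur(0), Used(ChunkSize) {}  // force alloc on first use
  const char *copy(const char *Buf, unsigned Len) {
    if (Used + Len > ChunkSize) {            // out of room: grab a new chunk
      unsigned Size = Len < ChunkSize ? ChunkSize : Len;  // oversize tokens
      Cur = new char[Size];                  // old chunks intentionally leak:
      Used = 0;                              // ScratchBuffer hands them to the
    }                                        // SourceManager for its lifetime
    std::memcpy(Cur + Used, Buf, Len);
    Used += Len;
    return Cur + Used - Len;
  }
};
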
diff --git a/clang/lib/Lex/TokenLexer.cpp b/clang/lib/Lex/TokenLexer.cpp
new file mode 100644
index 00000000000..fc8cfd715c4
--- /dev/null
+++ b/clang/lib/Lex/TokenLexer.cpp
@@ -0,0 +1,488 @@
+//===--- TokenLexer.cpp - Lex from a token stream -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License.  See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TokenLexer interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/TokenLexer.h"
+#include "MacroArgs.h"
+#include "clang/Lex/MacroInfo.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/Diagnostic.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace clang;
+
+
+/// Create a TokenLexer for the specified macro with the specified actual
+/// arguments.  Note that this ctor takes ownership of the ActualArgs pointer.
+void TokenLexer::Init(Token &Tok, MacroArgs *Actuals) {
+  // If the client is reusing a TokenLexer, make sure to free any memory
+  // associated with it.
+  destroy();
+
+  Macro = PP.getMacroInfo(Tok.getIdentifierInfo());
+  ActualArgs = Actuals;
+  CurToken = 0;
+  InstantiateLoc = Tok.getLocation();
+  AtStartOfLine = Tok.isAtStartOfLine();
+  HasLeadingSpace = Tok.hasLeadingSpace();
+  Tokens = &*Macro->tokens_begin();
+  OwnsTokens = false;
+  DisableMacroExpansion = false;
+  NumTokens = Macro->tokens_end()-Macro->tokens_begin();
+
+  // If this is a function-like macro, expand the arguments and change
+  // Tokens to point to the expanded tokens.
+  if (Macro->isFunctionLike() && Macro->getNumArgs())
+    ExpandFunctionArguments();
+
+  // Mark the macro as currently disabled, so that it is not recursively
+  // expanded.  The macro must be disabled only after argument pre-expansion
+  // of function-like macro arguments occurs.
+  Macro->DisableMacro();
+}
+
+
+
+/// Create a TokenLexer for the specified token stream.  If 'ownsTokens' is
+/// true, the TokenLexer takes ownership of the specified token vector and
+/// deletes it when done.
+void TokenLexer::Init(const Token *TokArray, unsigned NumToks,
+                      bool disableMacroExpansion, bool ownsTokens) {
+  // If the client is reusing a TokenLexer, make sure to free any memory
+  // associated with it.
+  destroy();
+
+  Macro = 0;
+  ActualArgs = 0;
+  Tokens = TokArray;
+  OwnsTokens = ownsTokens;
+  DisableMacroExpansion = disableMacroExpansion;
+  NumTokens = NumToks;
+  CurToken = 0;
+  InstantiateLoc = SourceLocation();
+  AtStartOfLine = false;
+  HasLeadingSpace = false;
+
+  // Set HasLeadingSpace/AtStartOfLine so that the first token will be
+  // returned unmodified.
+  if (NumToks != 0) {
+    AtStartOfLine   = TokArray[0].isAtStartOfLine();
+    HasLeadingSpace = TokArray[0].hasLeadingSpace();
+  }
+}
+
+
+void TokenLexer::destroy() {
+  // If this was a function-like macro that actually uses its arguments,
+  // delete the expanded tokens.
+  if (OwnsTokens) {
+    delete [] Tokens;
+    Tokens = 0;
+  }
+
+  // TokenLexer owns its actual arguments.
+  if (ActualArgs) ActualArgs->destroy();
+}
+
+/// Expand the arguments of a function-like macro so that we can quickly
+/// return preexpanded tokens from Tokens.
+void TokenLexer::ExpandFunctionArguments() {
+  llvm::SmallVector<Token, 128> ResultToks;
+
+  // Loop through 'Tokens', expanding them into ResultToks.  Keep
+  // track of whether we change anything.  If not, no need to keep them.  If
+  // so, we install the newly expanded sequence as the new 'Tokens' list.
+  bool MadeChange = false;
+
+  // NextTokGetsSpace - When this is true, the next token appended to the
+  // output list will get a leading space, regardless of whether it had one to
+  // begin with or not.  This is used for placemarker support.
+  bool NextTokGetsSpace = false;
+
+  for (unsigned i = 0, e = NumTokens; i != e; ++i) {
+    // If we found the stringify operator, get the argument stringified.  The
+    // preprocessor already verified that the following token is a macro name
+    // when the #define was parsed.
+    const Token &CurTok = Tokens[i];
+    if (CurTok.is(tok::hash) || CurTok.is(tok::hashat)) {
+      int ArgNo = Macro->getArgumentNum(Tokens[i+1].getIdentifierInfo());
+      assert(ArgNo != -1 && "Token following # is not an argument?");
+
+      Token Res;
+      if (CurTok.is(tok::hash))  // Stringify
+        Res = ActualArgs->getStringifiedArgument(ArgNo, PP);
+      else {
+        // 'charify': don't bother caching these.
+        Res = MacroArgs::StringifyArgument(ActualArgs->getUnexpArgument(ArgNo),
+                                           PP, true);
+      }
+
+      // The stringified/charified string leading space flag gets set to match
+      // the #/#@ operator.
+      if (CurTok.hasLeadingSpace() || NextTokGetsSpace)
+        Res.setFlag(Token::LeadingSpace);
+
+      ResultToks.push_back(Res);
+      MadeChange = true;
+      ++i;  // Skip arg name.
+      NextTokGetsSpace = false;
+      continue;
+    }
+
+    // Otherwise, if this is not an argument token, just add the token to the
+    // output buffer.
+    IdentifierInfo *II = CurTok.getIdentifierInfo();
+    int ArgNo = II ? Macro->getArgumentNum(II) : -1;
+    if (ArgNo == -1) {
+      // This isn't an argument, just add it.
+      ResultToks.push_back(CurTok);
+
+      if (NextTokGetsSpace) {
+        ResultToks.back().setFlag(Token::LeadingSpace);
+        NextTokGetsSpace = false;
+      }
+      continue;
+    }
+
+    // An argument is expanded in some way; the result differs from the input.
+    MadeChange = true;
+
+    // Otherwise, this is a use of the argument.  Find out if there is a paste
+    // (##) operator before or after the argument.
+    bool PasteBefore =
+      !ResultToks.empty() && ResultToks.back().is(tok::hashhash);
+    bool PasteAfter = i+1 != e && Tokens[i+1].is(tok::hashhash);
+
+    // If it is not the LHS/RHS of a ## operator, we must pre-expand the
+    // argument and substitute the expanded tokens into the result.  This is
+    // C99 6.10.3.1p1.
+    if (!PasteBefore && !PasteAfter) {
+      const Token *ResultArgToks;
+
+      // Only preexpand the argument if it could possibly need it.  This
+      // avoids some work in common cases.
+      const Token *ArgTok = ActualArgs->getUnexpArgument(ArgNo);
+      if (ActualArgs->ArgNeedsPreexpansion(ArgTok, PP))
+        ResultArgToks = &ActualArgs->getPreExpArgument(ArgNo, PP)[0];
+      else
+        ResultArgToks = ArgTok;  // Use non-preexpanded tokens.
+
+      // If the arg token expanded into anything, append it.
+      if (ResultArgToks->isNot(tok::eof)) {
+        unsigned FirstResult = ResultToks.size();
+        unsigned NumToks = MacroArgs::getArgLength(ResultArgToks);
+        ResultToks.append(ResultArgToks, ResultArgToks+NumToks);
+
+        // If any tokens were substituted from the argument, the whitespace
+        // before the first token should match the whitespace of the arg
+        // identifier.
+        ResultToks[FirstResult].setFlagValue(Token::LeadingSpace,
+                                             CurTok.hasLeadingSpace() ||
+                                             NextTokGetsSpace);
+        NextTokGetsSpace = false;
+      } else {
+        // If this is an empty argument, and if there was whitespace before
+        // the formal token, make sure the next token gets whitespace before
+        // it.
+        NextTokGetsSpace = CurTok.hasLeadingSpace();
+      }
+      continue;
+    }
+
+    // Okay, we have a token that is either the LHS or RHS of a paste (##)
+    // argument.  It gets substituted as its non-pre-expanded tokens.
+    const Token *ArgToks = ActualArgs->getUnexpArgument(ArgNo);
+    unsigned NumToks = MacroArgs::getArgLength(ArgToks);
+    if (NumToks) {  // Not an empty argument?
+      // If this is the GNU ", ## __VA_ARGS__" extension, and we just learned
+      // that __VA_ARGS__ expands to multiple tokens, avoid a pasting error
+      // when the expander tries to paste ',' with the first token of the
+      // __VA_ARGS__ expansion.
+      if (PasteBefore && ResultToks.size() >= 2 &&
+          ResultToks[ResultToks.size()-2].is(tok::comma) &&
+          (unsigned)ArgNo == Macro->getNumArgs()-1 &&
+          Macro->isVariadic()) {
+        // Remove the paste operator, report use of the extension.
+        PP.Diag(ResultToks.back().getLocation(), diag::ext_paste_comma);
+        ResultToks.pop_back();
+      }
+
+      ResultToks.append(ArgToks, ArgToks+NumToks);
+
+      // If the next token was supposed to get leading whitespace, ensure it
+      // has it now.
+      if (NextTokGetsSpace) {
+        ResultToks[ResultToks.size()-NumToks].setFlag(Token::LeadingSpace);
+        NextTokGetsSpace = false;
+      }
+      continue;
+    }
+
+    // If an empty argument is on the LHS or RHS of a paste, the standard (C99
+    // 6.10.3.3p2,3) calls for a bunch of placemarker stuff to occur.  We
+    // implement this by eating ## operators when a LHS or RHS expands to
+    // empty.
+    NextTokGetsSpace |= CurTok.hasLeadingSpace();
+    if (PasteAfter) {
+      // Discard the argument token and skip (don't copy to the expansion
+      // buffer) the paste operator after it.
+      NextTokGetsSpace |= Tokens[i+1].hasLeadingSpace();
+      ++i;
+      continue;
+    }
+
+    // If this is on the RHS of a paste operator, we've already copied the
+    // paste operator to the ResultToks list.  Remove it.
+    assert(PasteBefore && ResultToks.back().is(tok::hashhash));
+    NextTokGetsSpace |= ResultToks.back().hasLeadingSpace();
+    ResultToks.pop_back();
+
+    // If this is the __VA_ARGS__ token, and if the argument wasn't provided,
+    // and if the macro had at least one real argument, and if the token
+    // before the ## was a comma, remove the comma.
+    if ((unsigned)ArgNo == Macro->getNumArgs()-1 &&  // is __VA_ARGS__
+        ActualArgs->isVarargsElidedUse() &&          // Argument elided.
+        !ResultToks.empty() && ResultToks.back().is(tok::comma)) {
+      // Never add a space, even if the comma, ##, or arg had a space.
+      NextTokGetsSpace = false;
+      // Remove the paste operator, report use of the extension.
+      PP.Diag(ResultToks.back().getLocation(), diag::ext_paste_comma);
+      ResultToks.pop_back();
+    }
+    continue;
+  }
+
+  // If anything changed, install this as the new Tokens list.
+  if (MadeChange) {
+    // This is deleted in the dtor.
+    NumTokens = ResultToks.size();
+    Token *Res = new Token[ResultToks.size()];
+    if (NumTokens)
+      memcpy(Res, &ResultToks[0], NumTokens*sizeof(Token));
+    Tokens = Res;
+    OwnsTokens = true;
+  }
+}
+
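At the source level, the '#' stringify handling and the GNU ", ## __VA_ARGS__" comma elision implemented above look like this (illustrative; calling a variadic macro with no variadic arguments is the GNU extension the code reports with ext_paste_comma):

#include <cstdio>

#define STR(x) #x                 // '#' stringifies the unexpanded argument
#define WARN(fmt, ...) std::fprintf(stderr, fmt, ##__VA_ARGS__)

int main() {
  std::puts(STR(a + b));   // prints: a + b
  WARN("no args\n");       // GNU extension: ', ##' drops the comma when
                           // __VA_ARGS__ is empty, avoiding a paste error
  WARN("%d args\n", 1);    // comma kept when arguments are present
}
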
+/// Lex - Lex and return a token from this macro stream.
+///
+void TokenLexer::Lex(Token &Tok) {
+  // Lexing off the end of the macro, pop this macro off the expansion stack.
+  if (isAtEnd()) {
+    // If this is a macro (not a token stream), mark the macro enabled now
+    // that it is no longer being expanded.
+    if (Macro) Macro->EnableMacro();
+
+    // Pop this context off the preprocessor's lexer stack and get the next
+    // token.  This will delete "this", so remember the PP instance var.
+    Preprocessor &PPCache = PP;
+    if (PP.HandleEndOfTokenLexer(Tok))
+      return;
+
+    // HandleEndOfTokenLexer may not return a token.  If it doesn't, lex
+    // whatever is next.
+    return PPCache.Lex(Tok);
+  }
+
+  // If this is the first token of the expanded result, we inherit spacing
+  // properties later.
+  bool isFirstToken = CurToken == 0;
+
+  // Get the next token to return.
+  Tok = Tokens[CurToken++];
+
+  // If this token is followed by a token paste (##) operator, paste the
+  // tokens!
+  if (!isAtEnd() && Tokens[CurToken].is(tok::hashhash))
+    if (PasteTokens(Tok)) {
+      // When handling the Microsoft /##/ extension, the final token is
+      // returned by PasteTokens, not the pasted token.
+      return;
+    }
+
+  // The token's current location indicates where the token was lexed from.
+  // We need this information to compute the spelling of the token, but any
+  // diagnostics for the expanded token should appear as if they came from
+  // InstantiationLoc.  Pull this information together into a new
+  // SourceLocation that captures all of this.
+  if (InstantiateLoc.isValid()) {   // Don't do this for token streams.
+    SourceManager &SrcMgr = PP.getSourceManager();
+    Tok.setLocation(SrcMgr.getInstantiationLoc(Tok.getLocation(),
+                                               InstantiateLoc));
+  }
+
+  // If this is the first token, set the lexical properties of the token to
+  // match the lexical properties of the macro identifier.
+  if (isFirstToken) {
+    Tok.setFlagValue(Token::StartOfLine , AtStartOfLine);
+    Tok.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
+  }
+
+  // Handle recursive expansion!
+  if (Tok.getIdentifierInfo() && !DisableMacroExpansion)
+    return PP.HandleIdentifier(Tok);
+
+  // Otherwise, return a normal token.
+}
+
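Lex re-enables the macro once its expansion is exhausted; together with the DisableMacro call in Init, this yields the C99 6.10.3.4p2 non-recursion rule. An illustrative input, not part of the patch:

#include <cstdio>

void trace(const char *s) { std::printf("trace: %s\n", s); }

// Inside the expansion of 'trace', the name 'trace' is marked
// non-expandable (Token::DisableExpand), so it binds to the function.
#define trace(s) (std::printf("calling %s\n", #s), trace(s))

int main() {
  trace("hi");  // expands exactly once; the inner 'trace' calls the function
}
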
+/// PasteTokens - Tok is the LHS of a ## operator, and CurToken is the ##
+/// operator.  Read the ## and RHS, and paste the LHS/RHS together.  If there
+/// is another ## after it, chomp it iteratively.  Return the result as Tok.
+/// If this returns true, the caller should immediately return the token.
+bool TokenLexer::PasteTokens(Token &Tok) {
+  llvm::SmallVector<char, 128> Buffer;
+  do {
+    // Consume the ## operator.
+    SourceLocation PasteOpLoc = Tokens[CurToken].getLocation();
+    ++CurToken;
+    assert(!isAtEnd() && "No token on the RHS of a paste operator!");
+
+    // Get the RHS token.
+    const Token &RHS = Tokens[CurToken];
+
+    bool isInvalid = false;
+
+    // Allocate space for the result token.  This is guaranteed to be enough
+    // for the two tokens and a null terminator.
+    Buffer.resize(Tok.getLength() + RHS.getLength() + 1);
+
+    // Get the spelling of the LHS token in Buffer.
+    const char *BufPtr = &Buffer[0];
+    unsigned LHSLen = PP.getSpelling(Tok, BufPtr);
+    if (BufPtr != &Buffer[0])   // Really, we want the chars in Buffer!
+      memcpy(&Buffer[0], BufPtr, LHSLen);
+
+    BufPtr = &Buffer[LHSLen];
+    unsigned RHSLen = PP.getSpelling(RHS, BufPtr);
+    if (BufPtr != &Buffer[LHSLen])   // Really, we want the chars in Buffer!
+      memcpy(&Buffer[LHSLen], BufPtr, RHSLen);
+
+    // Add null terminator.
+    Buffer[LHSLen+RHSLen] = '\0';
+
+    // Trim excess space.
+    Buffer.resize(LHSLen+RHSLen+1);
+
+    // Plop the pasted result (including the trailing null) into a scratch
+    // buffer where we can lex it.
+    SourceLocation ResultTokLoc = PP.CreateString(&Buffer[0], Buffer.size());
+
+    // Lex the resultant pasted token into Result.
+    Token Result;
+
+    // Avoid testing /*, as the lexer would think it is the start of a comment
+    // and emit an error that it is unterminated.
+    if (Tok.is(tok::slash) && RHS.is(tok::star)) {
+      isInvalid = true;
+    } else if (Tok.is(tok::identifier) && RHS.is(tok::identifier)) {
+      // Common paste case: identifier+identifier = identifier.  Avoid
+      // creating a lexer and other overhead.
+      PP.IncrementPasteCounter(true);
+      Result.startToken();
+      Result.setKind(tok::identifier);
+      Result.setLocation(ResultTokLoc);
+      Result.setLength(LHSLen+RHSLen);
+    } else {
+      PP.IncrementPasteCounter(false);
+
+      // Make a lexer to lex this string from.
+      SourceManager &SourceMgr = PP.getSourceManager();
+      const char *ResultStrData = SourceMgr.getCharacterData(ResultTokLoc);
+
+      // Make a lexer object so that we lex and expand the paste result.
+      Lexer *TL = new Lexer(ResultTokLoc, PP, ResultStrData,
+                            ResultStrData+LHSLen+RHSLen /*don't include null*/);
+
+      // Lex a token in raw mode.  This way it won't look up identifiers
+      // automatically, lexing off the end will return an eof token, and
+      // warnings are disabled.  This returns true if the result token is the
+      // entire buffer.
+      bool IsComplete = TL->LexRawToken(Result);
+
+      // If we got an EOF token, we didn't form even ONE token.  For example,
+      // we did "/ ## /" to get "//".
+      IsComplete &= Result.isNot(tok::eof);
+      isInvalid = !IsComplete;
+
+      // We're now done with the temporary lexer.
+      delete TL;
+    }
+
+    // If pasting the two tokens didn't form a full new token, this is an
+    // error.  This occurs with "x ## +" and other stuff.  Return with Tok
+    // unmodified and with RHS as the next token to lex.
+    if (isInvalid) {
+      // Test for the Microsoft extension of /##/ turning into // here on the
+      // error path.
+      if (PP.getLangOptions().Microsoft && Tok.is(tok::slash) &&
+          RHS.is(tok::slash)) {
+        HandleMicrosoftCommentPaste(Tok);
+        return true;
+      } else {
+        // TODO: If not in assembler language mode.
+        PP.Diag(PasteOpLoc, diag::err_pp_bad_paste,
+                std::string(Buffer.begin(), Buffer.end()-1));
+        return false;
+      }
+    }
+
+    // Turn ## into 'unknown' to avoid # ## # from looking like a paste
+    // operator.
+    if (Result.is(tok::hashhash))
+      Result.setKind(tok::unknown);
+    // FIXME: Turn __VA_ARGS__ into "not a token"?
+
+    // Transfer properties of the LHS over to the Result.
+    Result.setFlagValue(Token::StartOfLine , Tok.isAtStartOfLine());
+    Result.setFlagValue(Token::LeadingSpace, Tok.hasLeadingSpace());
+
+    // Finally, replace LHS with the result, consume the RHS, and iterate.
+    ++CurToken;
+    Tok = Result;
+  } while (!isAtEnd() && Tokens[CurToken].is(tok::hashhash));
+
+  // Now that we got the result token, it will be subject to expansion.  Since
+  // token pasting re-lexes the result token in raw mode, identifier
+  // information isn't looked up.  As such, if the result is an identifier,
+  // look up id info.
+  if (Tok.is(tok::identifier)) {
+    // Look up the identifier info for the token.  We disabled identifier
+    // lookup by saying we're skipping contents, so we need to do this
+    // manually.
+    Tok.setIdentifierInfo(PP.LookUpIdentifierInfo(Tok));
+  }
+  return false;
+}
+
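The identifier fast path and the err_pp_bad_paste path above correspond to source like this (illustrative, not part of the patch):

#include <cstdio>

#define CAT(a, b) a##b   // '##' pastes two tokens into one

int main() {
  int xy = 42;
  std::printf("%d\n", CAT(x, y));  // pastes 'x' and 'y' into identifier 'xy'
  // CAT(x, +) would be ill-formed: "x+" is not a single token, which is
  // exactly the err_pp_bad_paste case diagnosed above.
}
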
+/// isNextTokenLParen - If the next token lexed will pop this macro off the
+/// expansion stack, return 2.  If the next unexpanded token is a '(', return
+/// 1, otherwise return 0.
+unsigned TokenLexer::isNextTokenLParen() const {
+  // Out of tokens?
+  if (isAtEnd())
+    return 2;
+  return Tokens[CurToken].is(tok::l_paren);
+}
+
+
+/// HandleMicrosoftCommentPaste - In Microsoft compatibility mode, /##/ pastes
+/// together to form a comment that comments out everything in the current
+/// macro, other active macros, and anything left on the current physical
+/// source line of the instantiated buffer.  Handle this by returning the
+/// first token on the next line.
+void TokenLexer::HandleMicrosoftCommentPaste(Token &Tok) {
+  // We 'comment out' the rest of this macro by just ignoring the rest of the
+  // tokens that have not been lexed yet, if any.
+
+  // Since this must be a macro, mark the macro enabled now that it is no
+  // longer being expanded.
+  assert(Macro && "Token streams can't paste comments");
+  Macro->EnableMacro();
+
+  PP.HandleMicrosoftCommentPaste(Tok);
+}
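The extension handled here is visible at the source level (illustrative only; accepted when LangOptions.Microsoft is set, rejected by conforming preprocessors):

// Microsoft mode: '/##/' pastes into '//', turning the rest of the macro
// and the rest of the line into a comment.
#define COMMENT /##/ everything after the paste is swallowed
int x = 1; COMMENT and this tail of the line is ignored too
int y = 2;  // lexing resumes at the first token of the next line
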