diff options
author | Dmitri Gribenko <gribozavr@gmail.com> | 2013-01-30 14:29:28 +0000 |
---|---|---|
committer | Dmitri Gribenko <gribozavr@gmail.com> | 2013-01-30 14:29:28 +0000 |
commit | 28800da1b352a43175aaa8d224d7fe6895b014be (patch) | |
tree | 5949959451f50794d4331e29f5315d2a7b9d34fb /clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp | |
parent | 32832e61767f11d87f67fafa085f6860a9b3733f (diff) | |
download | bcm5719-llvm-28800da1b352a43175aaa8d224d7fe6895b014be.tar.gz bcm5719-llvm-28800da1b352a43175aaa8d224d7fe6895b014be.zip |
Comment parsing: resolve more named character references
This reimplements r173850 with a better approach:
(1) use a TableGen-generated matcher instead of doing a linear search;
(2) avoid allocations for new strings by converting code points to string
iterals with TableGen.
llvm-svn: 173931
Diffstat (limited to 'clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp')
-rw-r--r-- | clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp | 83 |
1 files changed, 83 insertions, 0 deletions
diff --git a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp new file mode 100644 index 00000000000..3afe2b73f0a --- /dev/null +++ b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp @@ -0,0 +1,83 @@ +//===--- ClangCommentHTMLNamedCharacterReferenceEmitter.cpp -----------------=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This tablegen backend emits an fficient function to translate HTML named +// character references to UTF-8 sequences. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" +#include "llvm/TableGen/StringMatcher.h" +#include <vector> + +using namespace llvm; + +/// \brief Convert a code point to the corresponding UTF-8 sequence represented +/// as a C string literal. +/// +/// \returns true on success. +static bool translateCodePointToUTF8(unsigned CodePoint, + SmallVectorImpl<char> &CLiteral) { + char Translated[UNI_MAX_UTF8_BYTES_PER_CODE_POINT]; + char *TranslatedPtr = Translated; + if (!ConvertCodePointToUTF8(CodePoint, TranslatedPtr)) + return false; + + StringRef UTF8(Translated, TranslatedPtr - Translated); + + raw_svector_ostream OS(CLiteral); + OS << "\""; + for (size_t i = 0, e = UTF8.size(); i != e; ++i) { + OS << "\\x"; + OS.write_hex(static_cast<unsigned char>(UTF8[i])); + } + OS << "\""; + + return true; +} + +namespace clang { +void EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records, + raw_ostream &OS) { + std::vector<Record *> Tags = Records.getAllDerivedDefinitions("NCR"); + std::vector<StringMatcher::StringPair> NameToUTF8; + SmallString<32> CLiteral; + for (std::vector<Record *>::iterator I = Tags.begin(), E = Tags.end(); + I != E; ++I) { + Record &Tag = **I; + std::string Spelling = Tag.getValueAsString("Spelling"); + uint64_t CodePoint = Tag.getValueAsInt("CodePoint"); + CLiteral.clear(); + CLiteral.append("return "); + if (!translateCodePointToUTF8(CodePoint, CLiteral)) { + SrcMgr.PrintMessage(Tag.getLoc().front(), + SourceMgr::DK_Error, + Twine("invalid code point")); + continue; + } + CLiteral.append(";"); + + StringMatcher::StringPair Match(Spelling, CLiteral.str()); + NameToUTF8.push_back(Match); + } + + OS << "// This file is generated by TableGen. Do not edit.\n\n"; + + OS << "StringRef translateHTMLNamedCharacterReferenceToUTF8(\n" + " StringRef Name) {\n"; + StringMatcher("Name", NameToUTF8, OS).Emit(); + OS << " return StringRef();\n" + << "}\n\n"; +} + +} // end namespace clang + |