When checking the encoding of an 8-bit string literal, don't just check the first codepoint!

Also, don't reject empty raw string literals for spurious "encoding" issues. Also, don't rely on undefined behavior in ConvertUTF.c.

llvm-svn: 152344
author: Richard Smith <richard-llvm@metafoo.co.uk> 2012-03-08 21:59:28 +0000
committer: Richard Smith <richard-llvm@metafoo.co.uk> 2012-03-08 21:59:28 +0000
commit: 812924502bb7fbe0525757576aa2d16072ab5a87 (patch)
tree: cf817c7931b543509af5f99d86be261a19e92b85 /clang/lib/Basic
parent: 0ef86b0ea3392c672dd3ce69e32aa6d3d33603dd (diff)
download: bcm5719-llvm-812924502bb7fbe0525757576aa2d16072ab5a87.tar.gz bcm5719-llvm-812924502bb7fbe0525757576aa2d16072ab5a87.zip
1 files changed, 19 insertions, 3 deletions
diff --git a/clang/lib/Basic/ConvertUTF.c b/clang/lib/Basic/ConvertUTF.c
index b3fa9169344..e1970039e16 100644
--- a/clang/lib/Basic/ConvertUTF.c
+++ b/clang/lib/Basic/ConvertUTF.c
@@ -387,7 +387,7 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {
  */
 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
     int length = trailingBytesForUTF8[*source]+1;
-    if (source+length > sourceEnd) {
+    if (length > sourceEnd - source) {
         return false;
     }
     return isLegalUTF8(source, length);
@@ -395,6 +395,22 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
 
 /* --------------------------------------------------------------------- */
 
+/*
+ * Exported function to return whether a UTF-8 string is legal or not.
+ * This is not used here; it's just exported.
+ */
+Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd) {
+    while (source != sourceEnd) {
+        int length = trailingBytesForUTF8[*source] + 1;
+        if (length > sourceEnd - source || !isLegalUTF8(source, length))
+            return false;
+        source += length;
+    }
+    return true;
+}
+
+/* --------------------------------------------------------------------- */
+
 ConversionResult ConvertUTF8toUTF16 (
         const UTF8** sourceStart, const UTF8* sourceEnd, 
         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
@@ -404,7 +420,7 @@ ConversionResult ConvertUTF8toUTF16 (
     while (source < sourceEnd) {
         UTF32 ch = 0;
         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
-        if (source + extraBytesToRead >= sourceEnd) {
+        if (extraBytesToRead >= sourceEnd - source) {
             result = sourceExhausted; break;
         }
         /* Do this check whether lenient or strict */
@@ -477,7 +493,7 @@ ConversionResult ConvertUTF8toUTF32 (
     while (source < sourceEnd) {
         UTF32 ch = 0;
         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
-        if (source + extraBytesToRead >= sourceEnd) {
+        if (extraBytesToRead >= sourceEnd - source) {
             result = sourceExhausted; break;
         }
         /* Do this check whether lenient or strict */
author	Richard Smith <richard-llvm@metafoo.co.uk>	2012-03-08 21:59:28 +0000
committer	Richard Smith <richard-llvm@metafoo.co.uk>	2012-03-08 21:59:28 +0000
commit	812924502bb7fbe0525757576aa2d16072ab5a87 (patch)
tree	cf817c7931b543509af5f99d86be261a19e92b85 /clang/lib/Basic
parent	0ef86b0ea3392c672dd3ce69e32aa6d3d33603dd (diff)
download	bcm5719-llvm-812924502bb7fbe0525757576aa2d16072ab5a87.tar.gz bcm5719-llvm-812924502bb7fbe0525757576aa2d16072ab5a87.zip