lib/fold: Add support for multibyte strings

Currently, the fold_text function doesn't understand multibyte strings, so it may break a line in the middle of a multibyte sequence. This change adds multibyte-awareness to the fold code, and uses proper width calculations for the contents of the folded string.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
author: Jeremy Kerr <jk@ozlabs.org> 2014-09-23 14:46:06 +0800
committer: Samuel Mendoza-Jonas <sam.mj@au1.ibm.com> 2014-09-23 16:47:58 +1000
commit: 3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4 (patch)
tree: c1ff2d5ccb4ba0d5b0ef1af0f02bcad528ce7d5b
parent: 73ee21af6d0a379a104a21b7569331284b3659b7 (diff)
download: talos-petitboot-3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4.tar.gz
talos-petitboot-3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4.zip
2 files changed, 109 insertions, 15 deletions
diff --git a/lib/fold/fold.c b/lib/fold/fold.c
index ec10c8c..8bf133c 100644
--- a/lib/fold/fold.c
+++ b/lib/fold/fold.c
@@ -1,4 +1,12 @@
 
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#include <wchar.h>
+#include <wctype.h>
+
 #include "fold/fold.h"
 
 void fold_text(const char *text,
@@ -7,38 +15,78 @@ void fold_text(const char *text,
 		void *arg)
 {
 	const char *start, *end, *sep;
-	int rc = 0;
+	size_t sep_bytes, len;
+	int col, rc = 0;
+	mbstate_t ps;
 
+	/* start, end and sep are byte-positions in the string, and should always
+	 * lie on the start of a multibyte sequence */
 	start = end = sep = text;
+	sep_bytes = 0;
+	col = 0;
+	len = strlen(text);
+	memset(&ps, 0, sizeof(ps));
 
 	while (!rc) {
+		size_t bytes;
+		wchar_t wc;
+		int width;
+
+		bytes = mbrtowc(&wc, end, len - (end - text), &ps);
+
+		assert(bytes >= 0);
+
+		/* we'll get a zero size for the nul terminator */
+		if (!bytes) {
+			line_cb(arg, start, end - start);
+			break;
+		}
 
-		if (*end == '\n') {
+		if (wc == L'\n') {
 			rc = line_cb(arg, start, end - start);
-			start = sep = ++end;
+			start = sep = end += bytes;
+			sep_bytes = 0;
+			col = 0;
+			continue;
+		}
+
+		width = wcwidth(wc);
 
-		} else if (*end == '\0') {
+		/* we should have caught this in the !bytes check... */
+		if (width == 0) {
 			line_cb(arg, start, end - start);
-			rc = 1;
+			break;
+		}
 
-		} else if (end - start >= linelen - 1) {
+		/* unprintable character? just add it to the current line */
+		if (width < 0) {
+			end += bytes;
+			continue;
+		}
+
+		col += width;
+
+		if (col > linelen) {
 			if (sep != start) {
 				/* split on a previous word boundary, if
 				 * possible */
 				rc = line_cb(arg, start, sep - start);
-				start = end = ++sep;
+				end = sep + sep_bytes;
 			} else {
 				/* otherwise, break the word */
-				end++;
 				rc = line_cb(arg, start, end - start);
-				start = sep = end;
 			}
+			sep_bytes = 0;
+			start = sep = end;
+			col = 0;
 
 		} else {
-			end++;
 			/* record our last separator */
-			if (*end == ' ')
+			if (wc == L' ') {
 				sep = end;
+				sep_bytes = bytes;
+			}
+			end += bytes;
 		}
 	}
 }
diff --git a/test/lib/test-fold.c b/test/lib/test-fold.c
index 1f58fdf..474892d 100644
--- a/test/lib/test-fold.c
+++ b/test/lib/test-fold.c
@@ -1,7 +1,12 @@
 
+#define _GNU_SOURCE
+
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
+#include <locale.h>
+#include <wchar.h>
+#include <langinfo.h>
 
 #include <fold/fold.h>
 #include <list/list.h>
@@ -72,8 +77,19 @@ struct test test_break = {
 	},
 };
 
+struct test test_mbs = {
+	.in = "從主功能表畫面中，選取啟動選項。",
+	.linelen = 15,
+	.out = {
+		"從主功能表畫面",
+		"中，選取啟動選",
+		"項。",
+		NULL,
+	},
+};
+
 static struct test *tests[] = {
-	&test_split, &test_fold_line, &test_break,
+	&test_split, &test_fold_line, &test_break, &test_mbs,
 };
 
 static void __attribute__((noreturn)) fail(struct ctx *ctx,
@@ -83,7 +99,7 @@ static void __attribute__((noreturn)) fail(struct ctx *ctx,
 	int i;
 
 	fprintf(stderr, "%s\n", msg);
-	fprintf(stderr, "input:\n%s\n", test->in);
+	fprintf(stderr, "input, split at %d:\n%s\n", test->linelen, test->in);
 
 	fprintf(stderr, "expected:\n");
 	for (i = 0; test->out[i]; i++)
@@ -116,19 +132,39 @@ static void run_test(struct test *test)
 {
 	struct line *line;
 	struct ctx *ctx;
-	int i;
+	wchar_t *wcs;
+	int i, n;
 
 	ctx = talloc(NULL, struct ctx);
+	n = strlen(test->in) + 1;
 	list_init(&ctx->lines);
 	fold_text(test->in, test->linelen, fold_line_cb, ctx);
 
+
 	i = 0;
 	list_for_each_entry(&ctx->lines, line, list) {
+		size_t wcslen;
+		char *buf;
+		int width;
+
 		if (!test->out[i])
 			fail(ctx, test,
 				"fold_text returned more lines than expected");
 
-		if (line->len > test->linelen)
+		buf = talloc_strndup(ctx, line->buf, line->len);
+		wcslen = mbstowcs(NULL, buf, 0);
+
+		if (wcslen == (size_t)-1)
+			fail(ctx, test, "invalid mutlibyte sequence");
+
+		wcs = talloc_array(ctx, wchar_t, wcslen + 1);
+		wcslen = mbstowcs(wcs, buf, n);
+
+		width = wcswidth(wcs, wcslen);
+		if (width == -1)
+			fail(ctx, test, "nonprintable characters present");
+
+		if (width > (signed int)test->linelen)
 			fail(ctx, test, "line too long");
 
 		if (line->len != strlen(test->out[i]))
@@ -149,6 +185,16 @@ static void run_test(struct test *test)
 int main(void)
 {
 	unsigned int i;
+	char *charset;
+
+	setlocale(LC_CTYPE, "");
+
+	charset = nl_langinfo(CODESET);
+	if (strcmp(charset, "UTF-8")) {
+		fprintf(stderr, "Current charset is %s, tests require UTF-8\n",
+				charset);
+		return EXIT_FAILURE;
+	}
 
 	for (i = 0; i < ARRAY_SIZE(tests); i++)
 		run_test(tests[i]);
author	Jeremy Kerr <jk@ozlabs.org>	2014-09-23 14:46:06 +0800
committer	Samuel Mendoza-Jonas <sam.mj@au1.ibm.com>	2014-09-23 16:47:58 +1000
commit	3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4 (patch)
tree	c1ff2d5ccb4ba0d5b0ef1af0f02bcad528ce7d5b
parent	73ee21af6d0a379a104a21b7569331284b3659b7 (diff)
download	talos-petitboot-3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4.tar.gz talos-petitboot-3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4.zip