summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jeremy Kerr <jk@ozlabs.org> 2014-09-23 14:46:06 +0800
committer: Samuel Mendoza-Jonas <sam.mj@au1.ibm.com> 2014-09-23 16:47:58 +1000
commit: 3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4 (patch)
tree: c1ff2d5ccb4ba0d5b0ef1af0f02bcad528ce7d5b
parent: 73ee21af6d0a379a104a21b7569331284b3659b7 (diff)
download: talos-petitboot-3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4.tar.gz
download: talos-petitboot-3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4.zip
lib/fold: Add support for multibyte strings
Currently, the fold_text function doesn't understand multibyte strings, so it may break a line in the middle of a multibyte sequence. This change adds multibyte-awareness to the fold code, and uses proper width calculations for the contents of the folded string.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
-rw-r--r-- lib/fold/fold.c 70
-rw-r--r-- test/lib/test-fold.c 54
2 files changed, 109 insertions, 15 deletions
diff --git a/lib/fold/fold.c b/lib/fold/fold.c
index ec10c8c..8bf133c 100644
--- a/lib/fold/fold.c
+++ b/lib/fold/fold.c
@@ -1,4 +1,12 @@
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#include <wchar.h>
+#include <wctype.h>
+
#include "fold/fold.h"
void fold_text(const char *text,
@@ -7,38 +15,78 @@ void fold_text(const char *text,
void *arg)
{
const char *start, *end, *sep;
- int rc = 0;
+ size_t sep_bytes, len;
+ int col, rc = 0;
+ mbstate_t ps;
+ /* start, end and sep are byte-positions in the string, and should always
+ * lie on the start of a multibyte sequence */
start = end = sep = text;
+ sep_bytes = 0;
+ col = 0;
+ len = strlen(text);
+ memset(&ps, 0, sizeof(ps));
while (!rc) {
+ size_t bytes;
+ wchar_t wc;
+ int width;
+
+ bytes = mbrtowc(&wc, end, len - (end - text), &ps);
+
+ assert(bytes >= 0);
+
+ /* we'll get a zero size for the nul terminator */
+ if (!bytes) {
+ line_cb(arg, start, end - start);
+ break;
+ }
- if (*end == '\n') {
+ if (wc == L'\n') {
rc = line_cb(arg, start, end - start);
- start = sep = ++end;
+ start = sep = end += bytes;
+ sep_bytes = 0;
+ col = 0;
+ continue;
+ }
+
+ width = wcwidth(wc);
- } else if (*end == '\0') {
+ /* we should have caught this in the !bytes check... */
+ if (width == 0) {
line_cb(arg, start, end - start);
- rc = 1;
+ break;
+ }
- } else if (end - start >= linelen - 1) {
+ /* unprintable character? just add it to the current line */
+ if (width < 0) {
+ end += bytes;
+ continue;
+ }
+
+ col += width;
+
+ if (col > linelen) {
if (sep != start) {
/* split on a previous word boundary, if
* possible */
rc = line_cb(arg, start, sep - start);
- start = end = ++sep;
+ end = sep + sep_bytes;
} else {
/* otherwise, break the word */
- end++;
rc = line_cb(arg, start, end - start);
- start = sep = end;
}
+ sep_bytes = 0;
+ start = sep = end;
+ col = 0;
} else {
- end++;
/* record our last separator */
- if (*end == ' ')
+ if (wc == L' ') {
sep = end;
+ sep_bytes = bytes;
+ }
+ end += bytes;
}
}
}
diff --git a/test/lib/test-fold.c b/test/lib/test-fold.c
index 1f58fdf..474892d 100644
--- a/test/lib/test-fold.c
+++ b/test/lib/test-fold.c
@@ -1,7 +1,12 @@
+#define _GNU_SOURCE
+
#include <stdlib.h>
#include <string.h>
#include <assert.h>
+#include <locale.h>
+#include <wchar.h>
+#include <langinfo.h>
#include <fold/fold.h>
#include <list/list.h>
@@ -72,8 +77,19 @@ struct test test_break = {
},
};
+struct test test_mbs = {
+ .in = "從主功能表畫面中,選取啟動選項。",
+ .linelen = 15,
+ .out = {
+ "從主功能表畫面",
+ "中,選取啟動選",
+ "項。",
+ NULL,
+ },
+};
+
static struct test *tests[] = {
- &test_split, &test_fold_line, &test_break,
+ &test_split, &test_fold_line, &test_break, &test_mbs,
};
static void __attribute__((noreturn)) fail(struct ctx *ctx,
@@ -83,7 +99,7 @@ static void __attribute__((noreturn)) fail(struct ctx *ctx,
int i;
fprintf(stderr, "%s\n", msg);
- fprintf(stderr, "input:\n%s\n", test->in);
+ fprintf(stderr, "input, split at %d:\n%s\n", test->linelen, test->in);
fprintf(stderr, "expected:\n");
for (i = 0; test->out[i]; i++)
@@ -116,19 +132,39 @@ static void run_test(struct test *test)
{
struct line *line;
struct ctx *ctx;
- int i;
+ wchar_t *wcs;
+ int i, n;
ctx = talloc(NULL, struct ctx);
+ n = strlen(test->in) + 1;
list_init(&ctx->lines);
fold_text(test->in, test->linelen, fold_line_cb, ctx);
+
i = 0;
list_for_each_entry(&ctx->lines, line, list) {
+ size_t wcslen;
+ char *buf;
+ int width;
+
if (!test->out[i])
fail(ctx, test,
"fold_text returned more lines than expected");
- if (line->len > test->linelen)
+ buf = talloc_strndup(ctx, line->buf, line->len);
+ wcslen = mbstowcs(NULL, buf, 0);
+
+ if (wcslen == (size_t)-1)
+ fail(ctx, test, "invalid mutlibyte sequence");
+
+ wcs = talloc_array(ctx, wchar_t, wcslen + 1);
+ wcslen = mbstowcs(wcs, buf, n);
+
+ width = wcswidth(wcs, wcslen);
+ if (width == -1)
+ fail(ctx, test, "nonprintable characters present");
+
+ if (width > (signed int)test->linelen)
fail(ctx, test, "line too long");
if (line->len != strlen(test->out[i]))
@@ -149,6 +185,16 @@ static void run_test(struct test *test)
int main(void)
{
unsigned int i;
+ char *charset;
+
+ setlocale(LC_CTYPE, "");
+
+ charset = nl_langinfo(CODESET);
+ if (strcmp(charset, "UTF-8")) {
+ fprintf(stderr, "Current charset is %s, tests require UTF-8\n",
+ charset);
+ return EXIT_FAILURE;
+ }
for (i = 0; i < ARRAY_SIZE(tests); i++)
run_test(tests[i]);
OpenPOWER on IntegriCloud