From 3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr
Date: Tue, 23 Sep 2014 14:46:06 +0800
Subject: lib/fold: Add support for multibyte strings

Currently, the fold_text function doesn't understand multibyte strings,
so may break a line in the middle of a multibyte sequence.

This change adds multibyte-awareness to the fold code, and uses proper
width calculations for the contents of the folded string.

Signed-off-by: Jeremy Kerr
---
Review notes (placed after the "---" marker, so not part of the commit
message; all code changes are in-place, hunk line counts are unchanged):

 - fold_text(): "bytes" is a size_t, so "assert(bytes >= 0)" could never
   fire. It now checks for the (size_t)-1 error return of mbrtowc().
 - fold_text(): mbrtowc() is only handed the strlen() bytes of the text,
   i.e. never the terminating nul, so the end of the text may be reported
   as (size_t)-2 instead of 0. Both values now end the fold; previously
   (size_t)-2 fell through with "wc" unset.
 - test-fold: fixed the "mutlibyte" typo in the failure message.
 - test-fold: the commas in the test_mbs strings are fullwidth (U+FF0C);
   with an ASCII comma the expected 15-column split cannot be produced.
 - The header names of the bare "#include" lines are missing from this
   copy of the patch and have been left as found.

 lib/fold/fold.c      | 70 +++++++++++++++++++++++++++++++++++++++++++---------
 test/lib/test-fold.c | 54 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 109 insertions(+), 15 deletions(-)

diff --git a/lib/fold/fold.c b/lib/fold/fold.c
index ec10c8c..8bf133c 100644
--- a/lib/fold/fold.c
+++ b/lib/fold/fold.c
@@ -1,4 +1,12 @@
 
+#define _GNU_SOURCE
+
+#include
+#include
+#include
+#include
+#include
+
 #include "fold/fold.h"
 
 void fold_text(const char *text,
@@ -7,38 +15,78 @@ void fold_text(const char *text,
 		void *arg)
 {
 	const char *start, *end, *sep;
-	int rc = 0;
+	size_t sep_bytes, len;
+	int col, rc = 0;
+	mbstate_t ps;
 
+	/* start, end and sep are byte-positions in the string, and should always
+	 * lie on the start of a multibyte sequence */
 	start = end = sep = text;
+	sep_bytes = 0;
+	col = 0;
+	len = strlen(text);
+	memset(&ps, 0, sizeof(ps));
 
 	while (!rc) {
+		size_t bytes;
+		wchar_t wc;
+		int width;
+
+		bytes = mbrtowc(&wc, end, len - (end - text), &ps);
+
+		assert(bytes != (size_t)-1);
+
+		/* 0: nul terminator; (size_t)-2: no complete character left */
+		if (!bytes || bytes == (size_t)-2) {
+			line_cb(arg, start, end - start);
+			break;
+		}
 
-		if (*end == '\n') {
+		if (wc == L'\n') {
 			rc = line_cb(arg, start, end - start);
-			start = sep = ++end;
+			start = sep = end += bytes;
+			sep_bytes = 0;
+			col = 0;
+			continue;
+		}
+
+		width = wcwidth(wc);
 
-		} else if (*end == '\0') {
+		/* we should have caught this in the !bytes check... */
+		if (width == 0) {
 			line_cb(arg, start, end - start);
-			rc = 1;
+			break;
+		}
 
-		} else if (end - start >= linelen - 1) {
+		/* unprintable character? just add it to the current line */
+		if (width < 0) {
+			end += bytes;
+			continue;
+		}
+
+		col += width;
+
+		if (col > linelen) {
 			if (sep != start) {
 				/* split on a previous word boundary, if
 				 * possible */
 				rc = line_cb(arg, start, sep - start);
-				start = end = ++sep;
+				end = sep + sep_bytes;
 			} else {
 				/* otherwise, break the word */
-				end++;
 				rc = line_cb(arg, start, end - start);
-				start = sep = end;
 			}
+			sep_bytes = 0;
+			start = sep = end;
+			col = 0;
 
 		} else {
-			end++;
 			/* record our last separator */
-			if (*end == ' ')
+			if (wc == L' ') {
 				sep = end;
+				sep_bytes = bytes;
+			}
+			end += bytes;
 		}
 	}
 }
diff --git a/test/lib/test-fold.c b/test/lib/test-fold.c
index 1f58fdf..474892d 100644
--- a/test/lib/test-fold.c
+++ b/test/lib/test-fold.c
@@ -1,7 +1,12 @@
 
+#define _GNU_SOURCE
+
 #include
 #include
 #include
+#include
+#include
+#include
 
 #include
 #include
@@ -72,8 +77,19 @@ struct test test_break = {
 	},
 };
 
+struct test test_mbs = {
+	.in = "從主功能表畫面中,選取啟動選項。",
+	.linelen = 15,
+	.out = {
+		"從主功能表畫面",
+		"中,選取啟動選",
+		"項。",
+		NULL,
+	},
+};
+
 static struct test *tests[] = {
-	&test_split, &test_fold_line, &test_break,
+	&test_split, &test_fold_line, &test_break, &test_mbs,
 };
 
 static void __attribute__((noreturn)) fail(struct ctx *ctx,
@@ -83,7 +99,7 @@ static void __attribute__((noreturn)) fail(struct ctx *ctx,
 	int i;
 
 	fprintf(stderr, "%s\n", msg);
-	fprintf(stderr, "input:\n%s\n", test->in);
+	fprintf(stderr, "input, split at %d:\n%s\n", test->linelen, test->in);
 
 	fprintf(stderr, "expected:\n");
 	for (i = 0; test->out[i]; i++)
@@ -116,19 +132,39 @@ static void run_test(struct test *test)
 {
 	struct line *line;
 	struct ctx *ctx;
-	int i;
+	wchar_t *wcs;
+	int i, n;
 
 	ctx = talloc(NULL, struct ctx);
+	n = strlen(test->in) + 1;
 	list_init(&ctx->lines);
 	fold_text(test->in, test->linelen, fold_line_cb, ctx);
+
 
 	i = 0;
 	list_for_each_entry(&ctx->lines, line, list) {
+		size_t wcslen;
+		char *buf;
+		int width;
+
 		if (!test->out[i])
 			fail(ctx, test,
 				"fold_text returned more lines than expected");
 
-		if (line->len > test->linelen)
+		buf = talloc_strndup(ctx, line->buf, line->len);
+		wcslen = mbstowcs(NULL, buf, 0);
+
+		if (wcslen == (size_t)-1)
+			fail(ctx, test, "invalid multibyte sequence");
+
+		wcs = talloc_array(ctx, wchar_t, wcslen + 1);
+		wcslen = mbstowcs(wcs, buf, n);
+
+		width = wcswidth(wcs, wcslen);
+		if (width == -1)
+			fail(ctx, test, "nonprintable characters present");
+
+		if (width > (signed int)test->linelen)
 			fail(ctx, test, "line too long");
 
 		if (line->len != strlen(test->out[i]))
@@ -149,6 +185,16 @@ static void run_test(struct test *test)
 int main(void)
 {
 	unsigned int i;
+	char *charset;
+
+	setlocale(LC_CTYPE, "");
+
+	charset = nl_langinfo(CODESET);
+	if (strcmp(charset, "UTF-8")) {
+		fprintf(stderr, "Current charset is %s, tests require UTF-8\n",
+				charset);
+		return EXIT_FAILURE;
+	}
 
 	for (i = 0; i < ARRAY_SIZE(tests); i++)
 		run_test(tests[i]);
-- 
cgit v1.2.1