lib/fold: Add support for multibyte strings

Currently, the fold_text function doesn't understand multibyte strings, so it may break a line in the middle of a multibyte sequence. This change adds multibyte-awareness to the fold code, and uses proper width calculations for the contents of the folded string.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
author: Jeremy Kerr <jk@ozlabs.org> 2014-09-23 14:46:06 +0800
committer: Samuel Mendoza-Jonas <sam.mj@au1.ibm.com> 2014-09-23 16:47:58 +1000
commit: 3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4 (patch)
tree: c1ff2d5ccb4ba0d5b0ef1af0f02bcad528ce7d5b /lib
parent: 73ee21af6d0a379a104a21b7569331284b3659b7 (diff)
download: talos-petitboot-3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4.tar.gz
talos-petitboot-3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4.zip
1 files changed, 59 insertions, 11 deletions
diff --git a/lib/fold/fold.c b/lib/fold/fold.c
index ec10c8c..8bf133c 100644
--- a/lib/fold/fold.c
+++ b/lib/fold/fold.c
@@ -1,4 +1,12 @@
 
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#include <wchar.h>
+#include <wctype.h>
+
 #include "fold/fold.h"
 
 void fold_text(const char *text,
@@ -7,38 +15,78 @@ void fold_text(const char *text,
 		void *arg)
 {
 	const char *start, *end, *sep;
-	int rc = 0;
+	size_t sep_bytes, len;
+	int col, rc = 0;
+	mbstate_t ps;
 
+	/* start, end and sep are byte-positions in the string, and should always
+	 * lie on the start of a multibyte sequence */
 	start = end = sep = text;
+	sep_bytes = 0;
+	col = 0;
+	len = strlen(text);
+	memset(&ps, 0, sizeof(ps));
 
 	while (!rc) {
+		size_t bytes;
+		wchar_t wc;
+		int width;
+
+		bytes = mbrtowc(&wc, end, len - (end - text), &ps);
+
+		assert(bytes >= 0);
+
+		/* we'll get a zero size for the nul terminator */
+		if (!bytes) {
+			line_cb(arg, start, end - start);
+			break;
+		}
 
-		if (*end == '\n') {
+		if (wc == L'\n') {
 			rc = line_cb(arg, start, end - start);
-			start = sep = ++end;
+			start = sep = end += bytes;
+			sep_bytes = 0;
+			col = 0;
+			continue;
+		}
+
+		width = wcwidth(wc);
 
-		} else if (*end == '\0') {
+		/* we should have caught this in the !bytes check... */
+		if (width == 0) {
 			line_cb(arg, start, end - start);
-			rc = 1;
+			break;
+		}
 
-		} else if (end - start >= linelen - 1) {
+		/* unprintable character? just add it to the current line */
+		if (width < 0) {
+			end += bytes;
+			continue;
+		}
+
+		col += width;
+
+		if (col > linelen) {
 			if (sep != start) {
 				/* split on a previous word boundary, if
 				 * possible */
 				rc = line_cb(arg, start, sep - start);
-				start = end = ++sep;
+				end = sep + sep_bytes;
 			} else {
 				/* otherwise, break the word */
-				end++;
 				rc = line_cb(arg, start, end - start);
-				start = sep = end;
 			}
+			sep_bytes = 0;
+			start = sep = end;
+			col = 0;
 
 		} else {
-			end++;
 			/* record our last separator */
-			if (*end == ' ')
+			if (wc == L' ') {
 				sep = end;
+				sep_bytes = bytes;
+			}
+			end += bytes;
 		}
 	}
 }
author	Jeremy Kerr <jk@ozlabs.org>	2014-09-23 14:46:06 +0800
committer	Samuel Mendoza-Jonas <sam.mj@au1.ibm.com>	2014-09-23 16:47:58 +1000
commit	3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4 (patch)
tree	c1ff2d5ccb4ba0d5b0ef1af0f02bcad528ce7d5b /lib
parent	73ee21af6d0a379a104a21b7569331284b3659b7 (diff)
download	talos-petitboot-3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4.tar.gz talos-petitboot-3aef1b6d1f465596ebf7883a50efcf4d6f0ffcf4.zip