Add helpers to truncate UTF-8 at code points

This helps avoid truncating a UTF-8 string in the middle of a code point, which would produce an invalid UTF-8 result.
2019-05-30 19:01:08 +02:00
parent 3aa5426cad
commit 0a7fe7ad57
6 changed files with 121 additions and 1 deletions
--- a/app/src/str_util.c
+++ b/app/src/str_util.c
@@ -58,6 +58,22 @@ strquote(const char *src) {
    return quoted;
 }

+// Return the largest index <= max_len where utf8 can be cut without splitting
+// a code point; see <https://en.wikipedia.org/wiki/UTF-8#Description>
+size_t
+utf8_truncation_index(const char *utf8, size_t max_len) {
+    size_t len = strlen(utf8);
+    if (len <= max_len) {
+        return len;
+    }
+    len = max_len;
+    // skip back over continuation bytes (10xxxxxx); stop at 0 if malformed
+    while (len > 0 && (utf8[len] & 0xc0) == 0x80) {
+        len--;
+    }
+    return len;
+}
+
 #ifdef _WIN32

 wchar_t *