Add helpers to truncate UTF-8 at code points

This helps avoid truncating a UTF-8 string in the middle of a code
point, which would produce an invalid UTF-8 result.
This commit is contained in:
Romain Vimont
2019-05-30 19:01:08 +02:00
parent 3aa5426cad
commit 0a7fe7ad57
6 changed files with 121 additions and 1 deletions

View File

@@ -126,6 +126,37 @@ static void test_xstrjoin_truncated_after_sep(void) {
assert(!strcmp("abc de ", s));
}
// Checks utf8_truncation_index() against a string mixing 1-byte and 2-byte
// UTF-8 characters, for every second-argument value from 1 to 8.
static void test_utf8_truncate(void) {
    const char *s = "aÉbÔc";
    // 'É' and 'Ô' each take 2 bytes, the other three characters 1 byte each
    assert(strlen(s) == 7);

    // Each row pairs the second argument given to utf8_truncation_index()
    // with the value the test expects it to return.
    static const struct {
        size_t limit;
        size_t expected;
    } cases[] = {
        {1, 1},
        {2, 1}, // would fall inside the 2-byte 'É'
        {3, 3},
        {4, 4},
        {5, 4}, // would fall inside the 2-byte 'Ô'
        {6, 6},
        {7, 7},
        {8, 7}, // larger than the string: no more chars
    };

    for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); ++i) {
        size_t count = utf8_truncation_index(s, cases[i].limit);
        assert(count == cases[i].expected);
    }
}
int main(void) {
test_xstrncpy_simple();
test_xstrncpy_just_fit();
@@ -135,5 +166,6 @@ int main(void) {
test_xstrjoin_truncated_in_token();
test_xstrjoin_truncated_before_sep();
test_xstrjoin_truncated_after_sep();
test_utf8_truncate();
return 0;
}