While writing a simple text renderer I found a lack of UTF-8 decoders. Most decoders I found required allocating enough space for the decoded string; in the worst case that means the decoded output ends up four times as large as the original string.
I just needed to iterate over the characters in decoded form so I could render them on screen, so I wrote a simple function that lets me do that:
#include <cstdint>

// unsigned integer types
typedef uint64_t U64;
typedef uint32_t U32;
typedef uint16_t U16;
typedef uint8_t U8;
// signed integer types
typedef int64_t I64;
typedef int32_t I32;
typedef int16_t I16;
typedef int8_t I8;
U32 NextUTF8Char(const char* str, U32& idx)
{
    // https://en.wikipedia.org/wiki/UTF-8
    U8 c1 = (U8) str[idx];
    ++idx;
    U32 utf8c;
    if (((c1 >> 6) & 0b11) == 0b11)
    {
        // leading byte starts with 0b11 -> at least 2 bytes
        U8 c2 = (U8) str[idx];
        ++idx;
        if ((c1 >> 5) & 1)
        {
            // at least 3 bytes
            U8 c3 = (U8) str[idx];
            ++idx;
            if ((c1 >> 4) & 1)
            {
                // 4 bytes
                U8 c4 = (U8) str[idx];
                ++idx;
                utf8c = ((c1 & 0b00000111) << 18) | ((c2 & 0b00111111) << 12) |
                        ((c3 & 0b00111111) << 6) | (c4 & 0b00111111);
            } else
            {
                // 3 bytes
                utf8c = ((c1 & 0b00001111) << 12) | ((c2 & 0b00111111) << 6) |
                        (c3 & 0b00111111);
            }
        } else
        {
            // 2 bytes
            utf8c = ((c1 & 0b00011111) << 6) | (c2 & 0b00111111);
        }
    } else
    {
        // single byte (ASCII range)
        utf8c = c1 & 0b01111111;
    }
    return utf8c;
}
Usage:
const char* text = u8"ta suhi škafec pušča";
U32 idx = 0;
U32 c;
while ((c = NextUTF8Char(text, idx)) != 0)
{
    // c is the decoded Unicode code point as an unsigned int
}
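As a rough sanity check (not part of the renderer), the result can be compared against a known code point; for example 0x161 is the Unicode code point for 'š', which UTF-8 encodes as the two bytes 0xC5 0xA1:
#include <cassert>

// throwaway sanity check: decode "š" (U+0161) and confirm the code point
int main()
{
    const char* check = u8"š";                // two bytes in UTF-8: 0xC5 0xA1
    U32 i = 0;
    assert(NextUTF8Char(check, i) == 0x161);  // decoded code point for 'š'
    assert(NextUTF8Char(check, i) == 0);      // NUL terminator ends iteration
    return 0;
}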
I'm currently mostly concerned about the following:
- Readability: The intent of every piece of code is clear to the reader.
- Correctness: Everything is working as it should (I think it's clear what should happen).
- Performance: Can anything be done to improve the performance of this code?