While writing a simple text renderer I found a lack of suitable UTF-8 decoders. Most decoders I found required allocating enough space for the whole decoded string. In the worst case, that would mean the decoded string is four times as large as the original string.
I only needed to iterate over the characters in decoded form so that I could render them on the screen, so I wrote a simple function that lets me do that:
// Short aliases for the standard fixed-width integer types.
// Grouped by width, narrowest first: U = unsigned, I = signed.

// 8-bit
typedef uint8_t U8;
typedef int8_t I8;

// 16-bit
typedef uint16_t U16;
typedef int16_t I16;

// 32-bit
typedef uint32_t U32;
typedef int32_t I32;

// 64-bit
typedef uint64_t U64;
typedef int64_t I64;
/**
 * Decodes the UTF-8 sequence starting at str[idx] and advances idx past the
 * bytes that were consumed.
 *
 * @param str  NUL-terminated UTF-8 string; must not be null.
 * @param idx  In/out byte offset into str. On return it is the offset of the
 *             next undecoded byte.
 * @return     The decoded Unicode code point.
 *             0 when the terminating NUL is read. The NUL is consumed as
 *             well (idx ends up one past it), so stop calling after a 0.
 *             U+FFFD (replacement character) for malformed input: an invalid
 *             lead byte, a missing or invalid continuation byte, an overlong
 *             encoding, a UTF-16 surrogate, or a value above U+10FFFF.
 */
U32 NextUTF8Char(const char* str, U32& idx)
{
    // https://en.wikipedia.org/wiki/UTF-8
    const U32 kReplacement = 0xFFFDu;

    const U8 lead = (U8) str[idx];
    ++idx;

    if (lead < 0x80u)
    {
        // 0xxxxxxx: single byte; this also covers the terminating NUL.
        return lead;
    }

    U32 codePoint; // payload bits collected so far, most significant first
    U32 minValue;  // smallest code point that legitimately needs this length
    int trailing;  // number of 10xxxxxx continuation bytes still expected
    if ((lead & 0xE0u) == 0xC0u)
    {
        // 110xxxxx 10xxxxxx
        codePoint = lead & 0x1Fu;
        minValue = 0x80u;
        trailing = 1;
    }
    else if ((lead & 0xF0u) == 0xE0u)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        codePoint = lead & 0x0Fu;
        minValue = 0x800u;
        trailing = 2;
    }
    else if ((lead & 0xF8u) == 0xF0u)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        codePoint = lead & 0x07u;
        minValue = 0x10000u;
        trailing = 3;
    }
    else
    {
        // Stray continuation byte (10xxxxxx) or a byte that never occurs in
        // UTF-8 (11111xxx). Only the lead byte is consumed.
        return kReplacement;
    }

    // The lead byte carries the HIGH-order bits; every continuation byte
    // appends six lower-order bits.
    for (int i = 0; i < trailing; ++i)
    {
        const U8 next = (U8) str[idx];
        if ((next & 0xC0u) != 0x80u)
        {
            // Truncated sequence. The offending byte is left unconsumed so
            // that the terminating NUL is never skipped and decoding can
            // resynchronise on the next call.
            return kReplacement;
        }
        ++idx;
        codePoint = (codePoint << 6) | (next & 0x3Fu);
    }

    const bool overlong = codePoint < minValue;
    const bool surrogate = codePoint >= 0xD800u && codePoint <= 0xDFFFu;
    const bool outOfRange = codePoint > 0x10FFFFu;
    if (overlong || surrogate || outOfRange)
    {
        return kReplacement;
    }
    return codePoint;
}
Usage:
const char* text = u8"ta suhi škafec pušča";
U32 idx = 0;
U32 c;
while ((c = NextUTF8Char(text, idx)) != 0)
{
    // c is our utf-8 character in unsigned int format
}
I'm currently mostly concerned about the following:
- Readability: The intent of every piece of code is clear to the reader.
- Correctness: Everything is working as it should (I think it's clear what should happen).
- Performance: Can anything be done to improve the performance of this code?