While writing a simple text renderer I found a lack of UTF-8 decoders. Most decoders I found required allocating enough space for the decoded string; in the worst case that means the decoded output ends up four times as large as the original string.
I just needed to iterate over the characters in decoded form so I could render them on screen, so I wrote a simple function that lets me do that:
#include <cstdint>

// unsigned integer types
typedef uint64_t U64;
typedef uint32_t U32;
typedef uint16_t U16;
typedef uint8_t U8;
// signed integer types
typedef int64_t I64;
typedef int32_t I32;
typedef int16_t I16;
typedef int8_t I8;
U32 NextUTF8Char(const char* str, U32& idx)
{
    // https://en.wikipedia.org/wiki/UTF-8
    U8 c1 = (U8) str[idx];
    ++idx;
    U32 utf8c;
    if (((c1 >> 6) & 0b11) == 0b11)
    {
        // leading byte starts with 0b11 -> at least 2 bytes
        U8 c2 = (U8) str[idx];
        ++idx;
        if ((c1 >> 5) & 1)
        {
            // at least 3 bytes
            U8 c3 = (U8) str[idx];
            ++idx;
            if ((c1 >> 4) & 1)
            {
                // 4 bytes
                U8 c4 = (U8) str[idx];
                ++idx;
                utf8c = ((c1 & 0b00000111) << 18) | ((c2 & 0b00111111) << 12) |
                        ((c3 & 0b00111111) << 6) | (c4 & 0b00111111);
            } else
            {
                // 3 bytes
                utf8c = ((c1 & 0b00001111) << 12) | ((c2 & 0b00111111) << 6) |
                        (c3 & 0b00111111);
            }
        } else
        {
            // 2 bytes
            utf8c = ((c1 & 0b00011111) << 6) | (c2 & 0b00111111);
        }
    } else
    {
        // single byte (ASCII range)
        utf8c = c1 & 0b01111111;
    }
    return utf8c;
}
Usage:
const char* text = u8"ta suhi škafec pušča";
U32 idx = 0;
U32 c;
while ((c = NextUTF8Char(text, idx)) != 0)
{
    // c is the decoded Unicode code point as an unsigned int
}
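As a rough sanity check (not part of the renderer), the result can be compared against a known code point; for example 0x161 is the Unicode code point for 'š', which UTF-8 encodes as the two bytes 0xC5 0xA1:
#include <cassert>

// throwaway sanity check: decode "š" (U+0161) and confirm the code point
int main()
{
    const char* check = u8"š";                // two bytes in UTF-8: 0xC5 0xA1
    U32 i = 0;
    assert(NextUTF8Char(check, i) == 0x161);  // decoded code point for 'š'
    assert(NextUTF8Char(check, i) == 0);      // NUL terminator ends iteration
    return 0;
}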
I'm currently mostly concerned about the following:
- Readability: The intent of every piece of code is clear to the reader.
- Correctness: Everything is working as it should (I think it's clear what should happen).
- Performance: Can anything be done to improve the performance of this code?