I am trying to convert a tri-gram consisting of 3 characters of type wchar_t into an integer type (a uint64_t at present).
It's been quite a while since I last worked with shifting and squeezing bytes. Here's what I've come up with so far, and it seems to work - at least for the few test cases in the tri-gram list. However, I am not very confident in this code, and I have the feeling I may have made things more complicated than needed. The verbosity of the code is a result of experimenting.
In this version I only take characters of up to 3 bytes into account, and there is not much error checking yet. I should mention that portability is not much of a concern, as this code will only run on Unix/Linux boxes. Endianness should be properly handled, then.
I would appreciate some suggestions, or perhaps a simpler / more reliable solution for one line or another.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <locale.h>
#include <stdint.h>
#include <inttypes.h>
#include <math.h>
/**
 * Reconstruct a trigram from its packed uint64_t representation.
 *
 * Layout of v, most significant bits first:
 *   bits 63..48  first character
 *   bits 47..32  second character
 *   bits 31..16  third character
 *   bits 15..0   ignored
 *
 * @param v  packed trigram value
 * @param s  output buffer; must have room for 4 wchar_t elements
 *           (3 characters plus the terminating L'\0')
 */
void to_wchar(const uint64_t v, wchar_t *s)
{
    s[0] = (wchar_t) ((v >> 48) & 0xFFFFu);
    s[1] = (wchar_t) ((v >> 32) & 0xFFFFu);
    s[2] = (wchar_t) ((v >> 16) & 0xFFFFu);
    /* Always terminate: copying the low 16 bits of v here would leave the
     * string unterminated whenever those bits happen to be non-zero. */
    s[3] = L'\0';
}
/**
 * Pack three 4-byte character slots into a single uint64_t.
 *
 * v is read as three consecutive 4-byte slots, each stored most significant
 * byte first. Only the two low-order bytes of every slot (indices 2-3, 6-7
 * and 10-11) are used; they land in bits 63..48, 47..32 and 31..16 of the
 * result. The low 16 bits of the result are always zero.
 *
 * The packed value is also written to stderr as a debugging aid.
 *
 * @param v  buffer of at least 12 bytes
 * @return   the packed value
 */
uint64_t to_uint64(const uint8_t *v)
{
    /* Widen to uint64_t (not int64_t) before shifting: moving a byte >= 0x80
     * by 56 bits into the sign bit of a signed type is undefined behaviour. */
    const uint64_t ui64 =
        /* first character */
        ((uint64_t) v[2]  << 56) |
        ((uint64_t) v[3]  << 48) |
        /* second character */
        ((uint64_t) v[6]  << 40) |
        ((uint64_t) v[7]  << 32) |
        /* third character */
        ((uint64_t) v[10] << 24) |
        ((uint64_t) v[11] << 16);

    /* PRIu64 matches the argument type; "%ld" expects a (signed) long. */
    fprintf(stderr, "0x%" PRIx64 " (%" PRIu64 ")\n", ui64, ui64);
    return ui64;
}
/**
 * Pack a wide-character trigram into a uint64_t.
 *
 * Each of the first three characters of v is stored in its own 4-byte slot,
 * most significant byte first, and the slots are then combined by
 * to_uint64(). Strings shorter than three characters leave the remaining
 * slots zero; characters beyond the third are ignored.
 *
 * Characters >= 0x10000 are passed to count_bytes() and replaced by DEL
 * (0x007f) when it reports more than 3.
 * NOTE(review): count_bytes() is not defined in this file; presumably it
 * returns the encoded byte length of the character - confirm.
 *
 * @param v     NUL-terminated wide string
 * @param ui64  receives the packed value on success
 * @return      1 on success, 0 on failure (NULL argument or unsuitable
 *              wchar_t range)
 */
int trigram_to_ui64(const wchar_t *v, uint64_t *ui64)
{
    enum { TRIGRAM_LEN = 3, SLOT_BYTES = 4 };
    /* Fully zero-initialised, so unused slots pack as 0. A stack buffer also
     * removes the unchecked malloc() and the matching free(). */
    uint8_t packed[TRIGRAM_LEN * SLOT_BYTES] = {0};

    if (v == NULL || ui64 == NULL) {
        return 0;
    }

    if ((WCHAR_MIN > 0) || (WCHAR_MAX < 0x10FFFF)) {
        fprintf(stderr, "Unprocessable rare UTF8 character. Found WCHAR_MIN=%d and WCHAR_MAX=%d", (int) WCHAR_MIN, (int) WCHAR_MAX);
        return 0;
    }

    /* Bounded by TRIGRAM_LEN so an over-long input cannot overrun packed. */
    for (size_t i = 0; i < TRIGRAM_LEN && v[i] != L'\0'; i++) {
        wchar_t wc = v[i];

        /* Replace 4-byte and more chars, 0x007f = DEL */
        if (wc >= 0x10000) {
            wc = (count_bytes(wc) > 3 ? 0x007f : wc);
        }

        /* Store the character big-endian in its slot. */
        const uint32_t cp = (uint32_t) wc;
        uint8_t *slot = packed + i * SLOT_BYTES;
        slot[0] = (uint8_t) ((cp >> 24) & 0xff);
        slot[1] = (uint8_t) ((cp >> 16) & 0xff);
        slot[2] = (uint8_t) ((cp >> 8) & 0xff);
        slot[3] = (uint8_t) (cp & 0xff);
    }

    *ui64 = to_uint64(packed);
    return 1;
}
/**
 * Unpack a uint64_t back into a wide-character trigram.
 *
 * Thin wrapper around to_wchar(), which writes four elements
 * (trigram[0] .. trigram[3]).
 *
 * @param ui64     packed trigram value
 * @param trigram  output buffer with room for at least 4 wchar_t elements
 * @return         1 on success, 0 if trigram is NULL
 */
int ui64_to_trigram(const uint64_t ui64, wchar_t *trigram)
{
    if (trigram == NULL) {
        return 0;
    }
    to_wchar(ui64, trigram);
    return 1;
}
/**
 * Round-trip self test: pack every sample trigram into a uint64_t, unpack it
 * again and report whether the result matches the input.
 *
 * @return 0 after all samples were processed, EXIT_FAILURE if a sample could
 *         not be packed at all
 */
int main(int argc, char **argv)
{
    static const wchar_t *trigrams[] =
    {
        L"öx再",
        L"©¥c",
        L"再学习",
        L"AiB",
        L"a—c",
        L"екс",
        L"öbä",     /* comma was missing: this literal was merged with L"" */
        L"",
        L"A B",
        L" 学习",
        L"AB ",
        L"xy𝄞",
        NULL
    };
    /* Three characters plus the terminator: to_wchar() writes aux[0..3]. */
    wchar_t aux[4] = {0};

    (void) argc;
    (void) argv;

    setlocale(LC_CTYPE, "");
    setlocale(LC_COLLATE, "");

    /* Basic testing ... */
    for (const wchar_t **str = trigrams; *str; str++) {
        uint64_t ui64 = 0;

        printf("==> '%ls'\n", *str);

        /* Convert to uint64_t; on failure a diagnostic was already printed
         * and ui64 holds no meaningful value, so stop here. */
        if (!trigram_to_ui64(*str, &ui64)) {
            return EXIT_FAILURE;
        }

        /* Convert back to wchar_t string */
        ui64_to_trigram(ui64, aux);

        printf("<== '%ls': ", aux);
        if (wcscmp(*str, aux) == 0) {
            printf("OK\n");
        } else {
            printf("NOT OK\n");
        }
    }
    return 0;
}
    
`uint64_t` instead of assuming `unsigned long` would do the trick.
`unsigned long` :-)
`memset(z, 0, sizeof(*z));` to do more than `z[0] = 0;`?