I have two functions, a collection of possible error codes, and a unit-testing framework.
The parsing of a character into its unary prefix and payload is handled by a few small helpers: the function nlz and the macros nlo, lsb and msb, which count leading zeros or ones and generate masks of ones on the right or left. The nlz function could be rewritten for speed or (maybe) for clarity; the current version is optimized for size, to help abstract it away from the code that uses it.
I want to make these conversion functions really nice so I can use them as the basis of a language interpreter that's Unicode-capable.
It isn't factored into a library/header yet, but that's the obvious next step (and would complicate the presentation, I think).
minunit.h:
/* file: minunit.h
cf. http://www.jera.com/techinfo/jtns/jtn002.html

A minimal unit-test helper. Both macros expand to a `return` statement, so
they may only be used inside functions that return a message pointer
(char *), where a null return means "no failure". */

/* Make the enclosing function return `message` when `test` evaluates false. */
#define mu_assert(message, test) do { if (!(test)) return message; } while (0)
/* Call the test function `test`, count it in tests_run, and make the
enclosing function return the failure message if the test produced one. */
#define mu_run_test(test) do { char *message = test(); tests_run++; \
if (message) return message; } while (0)
/* Counter incremented by mu_run_test; the definition lives in the test program. */
extern int tests_run;
io.c:
//cf. http://www.ietf.org/rfc/rfc3629.txt p.3
#include <limits.h> // INT_MAX
#include <math.h> // log2
#include <stdint.h> // int_least32_t, uint_least32_t
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // strcmp, memcmp
//#include <sys/bitops.h> // ilog2
/*
<-------- adapters ("apps-"hungarian naming)
utf8 utf8(ucs4...)
ucs4 ucs4(utf8...)
*/
/* Error flags reported by the converters. Each flag is a distinct bit so
   several conditions can be OR-ed together into one value. */
enum errinfo {
    no_error                  = 0,
    invalid_encoding          = 1 << 0,  /* continuation byte where a lead byte was expected */
    invalid_extended_encoding = 1 << 1,  /* lead byte with five or more leading one bits */
    buffer_alloc_fail         = 1 << 2,  /* output buffer could not be allocated */
    bad_following_character   = 1 << 3,  /* byte after a lead byte is not a continuation byte */
    over_length_encoding      = 1 << 4,  /* value fits in a shorter sequence than was used */
    code_point_out_of_range   = 1 << 5,  /* code point cannot be represented (e.g. above 0x10FFFF) */
};
/* Decode the UTF-8 bytes at str into a heap-allocated array of code points.
   At most n code points are produced and decoding stops early at a NUL byte.
   If errinfo is non-NULL it receives the OR of the enum errinfo flags raised
   during decoding. Returns NULL when the buffer cannot be allocated;
   otherwise the caller owns the array and releases it with free(). */
int_least32_t *ucs4(char *str, int n, enum errinfo *errinfo);
/* Encode the n code points in ar as UTF-8 into a heap-allocated,
   NUL-terminated string. If errinfo is non-NULL, error flags are OR-ed into
   *errinfo. Returns NULL when the buffer cannot be allocated; otherwise the
   caller owns the string and releases it with free().
   NOTE(review): an is presumably an output length -- confirm against the
   definition before relying on it. */
char *utf8(int_least32_t *ar, int n, int *an, enum errinfo *errinfo);
/* Number of leading zero bits of x viewed as an 8-bit value:
   nlz(0) == 8, nlz(1) == 7, ..., nlz(0x80) == 0.
   Computed with integer shifts only, so it needs no floating-point library
   and cannot be affected by rounding. Callers pass single bytes; for values
   above 0xFF the result goes negative (8 minus the bit length of x). */
static int nlz(uint_least32_t x){
    int zeros = 8;

    /* every significant bit of x removes one leading zero */
    while (x) {
        --zeros;
        x >>= 1;
    }
    return zeros;
}
/* leading one bits of the byte x: flip the byte, then count leading zeros */
#define nlo(x) nlz((x)^0xFF)
/* unsigned value with the x least-significant bits set, e.g. lsb(6) == 0x3F */
#define lsb(x) (~(~0U<<(x)))
/* byte value with the x most-significant bits set, e.g. msb(2) == 0xC0 */
#define msb(x) (lsb(8-(x))^0xFFU)
int_least32_t *ucs4(char *str, int n, enum errinfo *errinfo){
unsigned char *p=str;
int_least32_t *u,*buf;
uint_least32_t x;
int pre;
int i,j;
if (errinfo) *errinfo=0;
buf=u=malloc(n*sizeof*u);
if (!buf)
if (errinfo) *errinfo |= buffer_alloc_fail;
if (buf)
for (i=0; i<n && *p; i++){
switch(pre=nlo(x=*p++)){
case 0: break;
case 1: if (errinfo) *errinfo |= invalid_encoding;
x=0xFFFD;
break;
case 2:
case 3:
case 4: x&=lsb(8-pre);
for (j=1; j<pre; j++){
if (nlo(*p)!=1)
if (errinfo) *errinfo |= bad_following_character;
x=(x<<6) | (*p++&lsb(6));
}
break;
default: if (errinfo) *errinfo |= invalid_extended_encoding;
x=0xFFFD;
break;
}
if (x < ((int[]){0,0,0x80,0x800,0x10000})[pre])
if (errinfo) *errinfo |= over_length_encoding;
*u++=x;
}
return buf;
}
/*
 * Encode an array of code points as a NUL-terminated UTF-8 string.
 *
 * ar       code points to encode; must hold at least n elements.
 * n        number of code points; values <= 0 yield an empty string.
 * an       optional; receives the number of bytes written, not counting
 *          the terminating NUL (0 on failure).
 * errinfo  optional; error flags are OR-ed into *errinfo. The value is not
 *          cleared here, so initialise it before the call.
 *
 * Returns a heap-allocated string (free() it when done), or NULL with
 * buffer_alloc_fail reported when the buffer cannot be allocated or n is so
 * large that the byte count would not fit in an int.
 *
 * Code points that UTF-8 cannot represent -- negative values, surrogates
 * (U+D800..U+DFFF) and values above U+10FFFF -- are skipped and reported as
 * code_point_out_of_range.
 */
char *utf8(int_least32_t *ar, int n, int *an, enum errinfo *errinfo){
    size_t count = n > 0 ? (size_t)n : 0;
    unsigned char *buf = NULL;
    unsigned char *p;
    size_t i;

    if (an) *an = 0;

    /* worst case is 4 bytes per code point plus the terminator; refuse sizes
       whose byte count could not be reported through an int */
    if (count <= (size_t)INT_MAX / 4)
        buf = malloc(count * 4 + 1);
    if (!buf) {
        if (errinfo) *errinfo |= buffer_alloc_fail;
        return NULL;
    }

    p = buf;
    for (i = 0; i < count; i++) {
        int_least32_t x = ar[i];

        if (x < 0 || x > 0x10FFFF || (x >= 0xD800 && x <= 0xDFFF)) {
            if (errinfo) *errinfo |= code_point_out_of_range;
            continue;
        }
        if (x <= (int_least32_t)lsb(7)) {
            *p++ = (unsigned char)x;
        } else if (x <= (int_least32_t)lsb(11)) {
            *p++ = (unsigned char)(msb(2) | (x >> 6));
            *p++ = (unsigned char)(msb(1) | (x & lsb(6)));
        } else if (x <= (int_least32_t)lsb(16)) {
            *p++ = (unsigned char)(msb(3) | (x >> 12));
            *p++ = (unsigned char)(msb(1) | ((x >> 6) & lsb(6)));
            *p++ = (unsigned char)(msb(1) | (x & lsb(6)));
        } else {
            *p++ = (unsigned char)(msb(4) | (x >> 18));
            *p++ = (unsigned char)(msb(1) | ((x >> 12) & lsb(6)));
            *p++ = (unsigned char)(msb(1) | ((x >> 6) & lsb(6)));
            *p++ = (unsigned char)(msb(1) | (x & lsb(6)));
        }
    }
    *p = 0;

    if (an) *an = (int)(p - buf);
    return (char *)buf;
}
#include "minunit.h"
int tests_run = 0;
/* Fail the enclosing test function when condition c is TRUE, returning the
   text of c as the failure message. Wrapped in do/while(0) so the expansion
   is a single statement and cannot capture a following `else`; the trailing
   semicolon is kept because existing call sites are written without one. */
#define test_case(c) do { if (c) return #c; } while (0);
/* Check nlz() on zero and on every single-bit byte value. Returns the text
   of the first failing check, or 0 when all checks pass. */
static char *test_nlz(){
    static const struct {
        unsigned input;
        int expected;
        char *failure;
    } checks[] = {
        {   0, 8, "nlz(0)!=8"   },
        {   1, 7, "nlz(1)!=7"   },
        {   2, 6, "nlz(2)!=6"   },
        {   4, 5, "nlz(4)!=5"   },
        {   8, 4, "nlz(8)!=4"   },
        {  16, 3, "nlz(16)!=3"  },
        {  32, 2, "nlz(32)!=2"  },
        {  64, 1, "nlz(64)!=1"  },
        { 128, 0, "nlz(128)!=0" },
    };
    size_t k;

    for (k = 0; k < sizeof checks / sizeof checks[0]; k++) {
        if (nlz(checks[k].input) != checks[k].expected)
            return checks[k].failure;
    }
    return 0;
}
/* Encoding the code points 'a','b','c' must give the string "abc".
   Returns a failure message, or 0 on success. The result buffer is released
   and an allocation failure counts as a test failure rather than being
   handed to strcmp(). */
static char *test_utf8(){
    int_least32_t input[] = {97, 98, 99};
    char *encoded = utf8(input, 3, NULL, NULL);
    int failed = !encoded || strcmp("abc", encoded) != 0;

    free(encoded);
    if (failed)
        return "strcmp(\"abc\",utf8((int[]){97,98,99},3,NULL,NULL))";
    return 0;
}
/* Decoding "abc" must give the code points 97, 98, 99.
   Returns a failure message, or 0 on success. The expected values use the
   same element type as the decoder's output, so the comparison does not
   depend on sizeof(int); the result buffer is released and an allocation
   failure counts as a test failure. */
static char *test_ucs4(){
    const int_least32_t expected[] = {97, 98, 99};
    int_least32_t *decoded = ucs4("abc", 3, NULL);
    int failed = !decoded || memcmp(expected, decoded, sizeof expected) != 0;

    free(decoded);
    if (failed)
        return "memcmp((int[]){97,98,99},ucs4(\"abc\",3,NULL),3*sizeof(int))";
    return 0;
}
/* Round-trip checks: UTF-8 -> code points -> UTF-8, and
   code points -> UTF-8 -> code points, must both reproduce the input.
   Returns a failure message, or 0 on success. Every intermediate and final
   buffer is released, and an allocation failure at either stage counts as
   a test failure. */
static char *test_transit(){
    const int_least32_t expected[] = {97, 98, 99};
    int_least32_t input[] = {97, 98, 99};
    int_least32_t *decoded;
    char *encoded;
    int failed;

    /* string -> code points -> string */
    decoded = ucs4("abc", 3, NULL);
    encoded = decoded ? utf8(decoded, 3, NULL, NULL) : NULL;
    failed = !encoded || strcmp("abc", encoded) != 0;
    free(encoded);
    free(decoded);
    if (failed)
        return "strcmp(\"abc\",utf8(ucs4(\"abc\",3,NULL),3,NULL,NULL))";

    /* code points -> string -> code points */
    encoded = utf8(input, 3, NULL, NULL);
    decoded = encoded ? ucs4(encoded, 3, NULL) : NULL;
    failed = !decoded || memcmp(expected, decoded, sizeof expected) != 0;
    free(decoded);
    free(encoded);
    if (failed)
        return "memcmp((int[]){97,98,99},ucs4(utf8((int[]){97,98,99},3,NULL,NULL),3,NULL),3*sizeof(int))";

    return 0;
}
/* Run every test in order through mu_run_test, which counts each one in
   tests_run and stops at the first failure. Returns that failure message,
   or 0 when all tests pass. */
static char *all_tests(){
    static char *(*const suite[])(void) = {
        test_nlz,
        test_utf8,
        test_ucs4,
        test_transit,
    };
    size_t k;

    for (k = 0; k < sizeof suite / sizeof suite[0]; k++) {
        mu_run_test(suite[k]);
    }
    return 0;
}
/* Test driver: run the suite, print the first failure (or a success line)
   followed by the number of tests run. The exit status is 0 when every test
   passed and 1 otherwise. */
int main(void) {
    char *result = all_tests();

    if (result != 0) {
        printf("%s\n", result);
    } else {
        printf("ALL TESTS PASSED\n");
    }
    printf("Tests run: %d\n", tests_run);
    return result != 0;
}
Note: this code has greatly benefited from a previous review on comp.lang.c, which inspired a similar review around the same time.