Became Hot Network Question

occurred Apr 18 at 15:18

Add forgotten explanation about restoring LUT string length in uses where candidate strings are random.

Source Link

edited Apr 18 at 7:55

user272752

// SeekHash:
//  Permute a bunch of operations looking for a combination
//  that results in no collisions
//  when applied to a small lexicon of strings.
//  Report those hashing operations and the "string LUT" for each.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

static char *mkCodeStr( int ph[], int nHashes ) {
    // Given a sparsean array of small value integers and the useful element count,
    // make a string containing 'A', 'B', 'C', ... each at meaningful offsets.
    // Set unimportant bytes to '@' to achieve internal padding.
    // String used as LUT; maximum len is 32 (to suit SMALL lexicon only)
    // Yes, only 26 letters in alphabet, but...
    static char buf[32 + 1];

    memset( buf, '@', 32 ); // init "background"
    int maxHash = 0;
    for( int i = 0; i < nHashes; i++ ) {
        buf[ ph[i] ] = 'A' + i; // assign alpha representative into applicable element
        maxHash = ph[i] > maxHash ? ph[i] : maxHash; // remember max element assigned
    }
    buf[maxHash+1] = '\0'; // trim excess string length
    return buf;
}

typedef char (*fnc_t)( char );
// There are many 'trivial' bit operations possible to 'alter' a single byte/char.
static char bit2( char x ) { return (x>>1)&0x01; }
static char add1( char x ) { return x+1; }
static char sub1( char x ) { return x-1; }
static char invr( char x ) { return ~x; }
static char shR1( char x ) { return x>>1; }
static char shR2( char x ) { return x>>2; }
static char shL1( char x ) { return x<<1; }
static char shL2( char x ) { return x<<2; }
static char same( char x ) { return x; }
static char null( char x ) { return 0; }

typedef int (*op_t)( char, char, char );
// There are many operations to combine 3 bytes. These are a few of them.
static int sum_sum( char x, char y, char z ) { return x + y + z; }
static int xor_sum( char x, char y, char z ) { return x ^ y + z; }
static int sum_xor( char x, char y, char z ) { return x + y ^ z; }
static int lft_fld( char x, char y, char z ) { return sub1(x) + y ^ z; }

#define ENTRY(x) { #x, x, } // String-ize and the function address (pointer)

void seekHash3chars( char *wordList[], int nWords ) {
    // Each of leftmost 3 chars of every word will be subjected to a bit manipulation function.

    // 3 funcPtrs and an array of the "byte's bit mangling" operations available
    struct {
        char *name;
        fnc_t fnc;
    }  *f0, *f1, *f2, fncs[] = {
        ENTRY( null ),
        ENTRY( same ),
        ENTRY( shL1 ),
        ENTRY( shL2 ),
        ENTRY( shR1 ),
        ENTRY( shR2 ),
        ENTRY( invr ),
        ENTRY( sub1 ),
        ENTRY( add1 ),
        ENTRY( bit2 ),
    };
    const int nFncs = sizeof fncs / sizeof fncs[0];

    // List of "byte combining" operations (in same style as above)
    struct {
        char *name;
        op_t fnc;
    }  *op, ops[] = {
        ENTRY( sum_sum ),
        ENTRY( sum_xor ),
        ENTRY( xor_sum ),
        ENTRY( lft_fld ),
    };
    const int nOps = sizeof ops / sizeof ops[0];

    int hashes[32]; // Up to 32 distinct hash values, one for each lexicon entry
    unsigned int results = 0; // Up to 32 bit flags set as result of hash (to quickly detect/reject collisions)
    bool foundSome = false; // flag remains clear if no 4 bit operations work. Try 5 bits...

    for( int mask = 0xF; !foundSome && mask <= 0x1F; mask |= mask<<1 ) {
        // Try hashes between 0-15, then try larger hashes 0-3231

        for( op = ops; op < &ops[nOps]; op++ ) {
            // Try each of the "byte combining" operations

            for( f0 = fncs; f0 < &fncs[nFncs]; f0++ ) {
                // Use each byte mangle on 1st character

                for( f1 = fncs; f1 < &fncs[nFncs]; f1++ ) {
                    // Use each byte mangle on 2nd character

                    for( f2 = fncs; f2 < &fncs[nFncs]; f2++ ) {
                        // Use each byte mangle on 3rd character

                        results = 0;
                        for( int w = 0; w < nWords; w++ ) {
                            // Go thru entire lexicon (eg 12 month names)
                            // Mangle first 3 characters
                            char x = f0->fnc(wordList[w][0] &0x1F ); // low 5 bits of each char
                            char y = f1->fnc(wordList[w][1] &0x1F );
                            char z = f2->fnc(wordList[w][2] &0x1F );

                            int possHash = op->fnc( x, y, z ) & mask; // Combine 3 results and mask to 4/5 bits

                            hashes[w] = possHash; // remember this hash value of this lexicon "word"

                            // If collision detected, stop trying immediately and try something else
                            if( (0x01 << possHash) & results )
                                break;

                            results |= (0x01 << possHash); // mark as used and continue with more words
                        }

                        if( w >= nWords ) {
                            // WOW! This combination of operations on this lexicon results in NO collisions!!
                            // Report operations, some view of results and statistics,
                            // then formulate LUT as a string of '@' and 'A' to 'Z-ish'
                            printf( "%s (%s %s %s) hashes -", op->name, f0->name, f1->name, f2->name );
                            int min = hashes[0], max = hashes[0];
                            for( w = 0; w < nWords; w++ ) {

                                printf( " %2d", hashes[w] );
                                if( min > hashes[w] ) min = hashes[w];
                                if( max < hashes[w] ) max = hashes[w];
                            }
                            printf( "  min(%d) max(%d) range(%d) '%s'\n", min, max, max - min + 1, mkCodeStr( hashes, w ) );
                            foundSome = true;
                        }
                    }
                }
            }
        }
    }
}

static void demo( char *lst[], int num, int (*fnc)(char*) ) {
    // Output lexicon strings and each derived 'index' in two columns
    for( int i = 0; i < num ; i++ )
        printf( "%11s - %02d  #  %-5s - %02d\n", lst[ i ], fnc( lst[i] ), lst[ i+num ], fnc( lst[i+num] ) );
    printf( "\n" );
}


static int monthOrd( char *mn ) { return "DIE@CB@LJF@HAG@K"[ mn[1]/4&7 ^ mn[2]*2 &0xF ] &0xF; }

static void demo_MonthOrd( void ) {
// Convert a month name (or 3 letter abbrev.) to index (1-12). Beware false positives!
    char *ms[] = { 
        "January", "February", "March", "April", "May", "June", "July", "August", "Sept", "October", "Nov", "December",
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    };
    const int num = sizeof(ms)/sizeof(ms[0]);

    demo( ms, num/2, monthOrd );

//  seekHash3chars( ms, num/2 ); // disabled for CR posting...
}


static int wkdayOrd( char cp[] ) { return "65013427"[*cp/2 + ~cp[1] & 0x7] & 0x7; }

static void demo_WkDayOrd( void ) {
// Convert a day name (or 3 letter abbrev.) to index (1-7, Sun=1). Beware false positives!
    char *ds[] = {
        "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
        "SU", "MO", "TU", "WE", "TH", "FR", "SA"
    };
    const int num = sizeof(ds)/sizeof(ds[0]);

    demo( ds, num/2, wkdayOrd );

    // Show searching for candidate operation combinations
    puts( "\n\nPress enter for demo of search in action" );  getchar();
    seekHash3chars( ds, num/2 ); // enabled for CR posting...
}



int main( void ) {
    printf( "demo_MonthOrd()\n" );  demo_MonthOrd();

    puts( "\n\nPress enter for demo of weekday lexicon" ); getchar();

    printf( "demo_WkDayOrd()\n" );  demo_WkDayOrd();

    return 0;
}

NB: In this version of the code, mkCodeStr() trims the LUT string to a minimum length, effectively discarding contiguous '@' characters to the right of the highest valid mapping alpha character. This works fine in a scenario in which the candidate being hashed and mapped is definitely a member of the lexicon (but which member?). For instance, when dealing with tokens like month names that appear at a known location in a row of a computer generated table of records. No "caller verification" would be required.

In uses wherein the candidate token could be any random word, its hash could result in a value (0-15 or 0-31), this truncation of the LUT string must be manually undone before use.

Example: this program may propose a 'trimmed' LUT string like "@@BADCE@FG" (10 chars). This would be fine if the candidates were guaranteed to be one of the lexicon's words. For dealing with any 'random' word, the LUT string needs to be padded on the right "@@BADCE@FG@@@@@@" (16 chars) as the candidate string has potential, after bit masking operations, to lead to any hash value from 0 to 15.

// SeekHash:
//  Permute a bunch of operations looking for a combination
//  that results in no collisions
//  when applied to a small lexicon of strings.
//  Report those hashing operations and the "string LUT" for each.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

static char *mkCodeStr( int ph[], int nHashes ) {
    // Given a sparse array of small value integers and the useful element count,
    // make a string containing 'A', 'B', 'C', ... each at meaningful offsets.
    // Set unimportant bytes to '@' to achieve internal padding.
    // String used as LUT; maximum len is 32 (to suit SMALL lexicon only)
    // Yes, only 26 letters in alphabet, but...
    static char buf[32 + 1];

    memset( buf, '@', 32 ); // init "background"
    int maxHash = 0;
    for( int i = 0; i < nHashes; i++ ) {
        buf[ ph[i] ] = 'A' + i; // assign alpha representative into applicable element
        maxHash = ph[i] > maxHash ? ph[i] : maxHash; // remember max element assigned
    }
    buf[maxHash+1] = '\0'; // trim excess string length
    return buf;
}

typedef char (*fnc_t)( char );
// There are many 'trivial' bit operations possible to 'alter' a single byte/char.
static char bit2( char x ) { return (x>>1)&0x01; }
static char add1( char x ) { return x+1; }
static char sub1( char x ) { return x-1; }
static char invr( char x ) { return ~x; }
static char shR1( char x ) { return x>>1; }
static char shR2( char x ) { return x>>2; }
static char shL1( char x ) { return x<<1; }
static char shL2( char x ) { return x<<2; }
static char same( char x ) { return x; }
static char null( char x ) { return 0; }

typedef int (*op_t)( char, char, char );
// There are many operations to combine 3 bytes. These are a few of them.
static int sum_sum( char x, char y, char z ) { return x + y + z; }
static int xor_sum( char x, char y, char z ) { return x ^ y + z; }
static int sum_xor( char x, char y, char z ) { return x + y ^ z; }
static int lft_fld( char x, char y, char z ) { return sub1(x) + y ^ z; }

#define ENTRY(x) { #x, x, } // String-ize and the function address (pointer)

void seekHash3chars( char *wordList[], int nWords ) {
    // Each of leftmost 3 chars of every word will be subjected to a bit manipulation function.

    // 3 funcPtrs and an array of the "byte's bit mangling" operations available
    struct {
        char *name;
        fnc_t fnc;
    }  *f0, *f1, *f2, fncs[] = {
        ENTRY( null ),
        ENTRY( same ),
        ENTRY( shL1 ),
        ENTRY( shL2 ),
        ENTRY( shR1 ),
        ENTRY( shR2 ),
        ENTRY( invr ),
        ENTRY( sub1 ),
        ENTRY( add1 ),
        ENTRY( bit2 ),
    };
    const int nFncs = sizeof fncs / sizeof fncs[0];

    // List of "byte combining" operations (in same style as above)
    struct {
        char *name;
        op_t fnc;
    }  *op, ops[] = {
        ENTRY( sum_sum ),
        ENTRY( sum_xor ),
        ENTRY( xor_sum ),
        ENTRY( lft_fld ),
    };
    const int nOps = sizeof ops / sizeof ops[0];

    int hashes[32]; // Up to 32 distinct hash values, one for each lexicon entry
    unsigned int results = 0; // Up to 32 bit flags set as result of hash (to quickly detect/reject collisions)
    bool foundSome = false; // flag remains clear if no 4 bit operations work. Try 5 bits...

    for( int mask = 0xF; !foundSome && mask <= 0x1F; mask |= mask<<1 ) {
        // Try hashes between 0-15, then try larger hashes 0-32

        for( op = ops; op < &ops[nOps]; op++ ) {
            // Try each of the "byte combining" operations

            for( f0 = fncs; f0 < &fncs[nFncs]; f0++ ) {
                // Use each byte mangle on 1st character

                for( f1 = fncs; f1 < &fncs[nFncs]; f1++ ) {
                    // Use each byte mangle on 2nd character

                    for( f2 = fncs; f2 < &fncs[nFncs]; f2++ ) {
                        // Use each byte mangle on 3rd character

                        results = 0;
                        for( int w = 0; w < nWords; w++ ) {
                            // Go thru entire lexicon (eg 12 month names)
                            // Mangle first 3 characters
                            char x = f0->fnc(wordList[w][0] &0x1F ); // low 5 bits of each char
                            char y = f1->fnc(wordList[w][1] &0x1F );
                            char z = f2->fnc(wordList[w][2] &0x1F );

                            int possHash = op->fnc( x, y, z ) & mask; // Combine 3 results and mask to 4/5 bits

                            hashes[w] = possHash; // remember this hash value of this lexicon "word"

                            // If collision detected, stop trying immediately and try something else
                            if( (0x01 << possHash) & results )
                                break;

                            results |= (0x01 << possHash); // mark as used and continue with more words
                        }

                        if( w >= nWords ) {
                            // WOW! This combination of operations on this lexicon results in NO collisions!!
                            // Report operations, some view of results and statistics,
                            // then formulate LUT as a string of '@' and 'A' to 'Z-ish'
                            printf( "%s (%s %s %s) hashes -", op->name, f0->name, f1->name, f2->name );
                            int min = hashes[0], max = hashes[0];
                            for( w = 0; w < nWords; w++ ) {

                                printf( " %2d", hashes[w] );
                                if( min > hashes[w] ) min = hashes[w];
                                if( max < hashes[w] ) max = hashes[w];
                            }
                            printf( "  min(%d) max(%d) range(%d) '%s'\n", min, max, max - min + 1, mkCodeStr( hashes, w ) );
                            foundSome = true;
                        }
                    }
                }
            }
        }
    }
}

static void demo( char *lst[], int num, int (*fnc)(char*) ) {
    // Output lexicon strings and each derived 'index' in two columns
    for( int i = 0; i < num ; i++ )
        printf( "%11s - %02d  #  %-5s - %02d\n", lst[ i ], fnc( lst[i] ), lst[ i+num ], fnc( lst[i+num] ) );
    printf( "\n" );
}


static int monthOrd( char *mn ) { return "DIE@CB@LJF@HAG@K"[ mn[1]/4&7 ^ mn[2]*2 &0xF ] &0xF; }

static void demo_MonthOrd( void ) {
// Convert a month name (or 3 letter abbrev.) to index (1-12). Beware false positives!
    char *ms[] = { 
        "January", "February", "March", "April", "May", "June", "July", "August", "Sept", "October", "Nov", "December",
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    };
    const int num = sizeof(ms)/sizeof(ms[0]);

    demo( ms, num/2, monthOrd );

//  seekHash3chars( ms, num/2 ); // disabled for CR posting...
}


static int wkdayOrd( char cp[] ) { return "65013427"[*cp/2 + ~cp[1] & 0x7] & 0x7; }

static void demo_WkDayOrd( void ) {
// Convert a day name (or 3 letter abbrev.) to index (1-7, Sun=1). Beware false positives!
    char *ds[] = {
        "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
        "SU", "MO", "TU", "WE", "TH", "FR", "SA"
    };
    const int num = sizeof(ds)/sizeof(ds[0]);

    demo( ds, num/2, wkdayOrd );

    // Show searching for candidate operation combinations
    puts( "\n\nPress enter for demo of search in action" );  getchar();
    seekHash3chars( ds, num/2 ); // enabled for CR posting...
}



int main( void ) {
    printf( "demo_MonthOrd()\n" );  demo_MonthOrd();

    puts( "\n\nPress enter for demo of weekday lexicon" ); getchar();

    printf( "demo_WkDayOrd()\n" );  demo_WkDayOrd();

    return 0;
}

// SeekHash:
//  Permute a bunch of operations looking for a combination
//  that results in no collisions
//  when applied to a small lexicon of strings.
//  Report those hashing operations and the "string LUT" for each.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

static char *mkCodeStr( int ph[], int nHashes ) {
    // Given an array of small value integers and the useful element count,
    // make a string containing 'A', 'B', 'C', ... each at meaningful offsets.
    // Set unimportant bytes to '@' to achieve internal padding.
    // String used as LUT; maximum len is 32 (to suit SMALL lexicon only)
    // Yes, only 26 letters in alphabet, but...
    static char buf[32 + 1];

    memset( buf, '@', 32 ); // init "background"
    int maxHash = 0;
    for( int i = 0; i < nHashes; i++ ) {
        buf[ ph[i] ] = 'A' + i; // assign alpha representative into applicable element
        maxHash = ph[i] > maxHash ? ph[i] : maxHash; // remember max element assigned
    }
    buf[maxHash+1] = '\0'; // trim excess string length
    return buf;
}

typedef char (*fnc_t)( char );
// There are many 'trivial' bit operations possible to 'alter' a single byte/char.
static char bit2( char x ) { return (x>>1)&0x01; }
static char add1( char x ) { return x+1; }
static char sub1( char x ) { return x-1; }
static char invr( char x ) { return ~x; }
static char shR1( char x ) { return x>>1; }
static char shR2( char x ) { return x>>2; }
static char shL1( char x ) { return x<<1; }
static char shL2( char x ) { return x<<2; }
static char same( char x ) { return x; }
static char null( char x ) { return 0; }

typedef int (*op_t)( char, char, char );
// There are many operations to combine 3 bytes. These are a few of them.
static int sum_sum( char x, char y, char z ) { return x + y + z; }
static int xor_sum( char x, char y, char z ) { return x ^ y + z; }
static int sum_xor( char x, char y, char z ) { return x + y ^ z; }
static int lft_fld( char x, char y, char z ) { return sub1(x) + y ^ z; }

#define ENTRY(x) { #x, x, } // String-ize and the function address (pointer)

void seekHash3chars( char *wordList[], int nWords ) {
    // Each of leftmost 3 chars of every word will be subjected to a bit manipulation function.

    // 3 funcPtrs and an array of the "byte's bit mangling" operations available
    struct {
        char *name;
        fnc_t fnc;
    }  *f0, *f1, *f2, fncs[] = {
        ENTRY( null ),
        ENTRY( same ),
        ENTRY( shL1 ),
        ENTRY( shL2 ),
        ENTRY( shR1 ),
        ENTRY( shR2 ),
        ENTRY( invr ),
        ENTRY( sub1 ),
        ENTRY( add1 ),
        ENTRY( bit2 ),
    };
    const int nFncs = sizeof fncs / sizeof fncs[0];

    // List of "byte combining" operations (in same style as above)
    struct {
        char *name;
        op_t fnc;
    }  *op, ops[] = {
        ENTRY( sum_sum ),
        ENTRY( sum_xor ),
        ENTRY( xor_sum ),
        ENTRY( lft_fld ),
    };
    const int nOps = sizeof ops / sizeof ops[0];

    int hashes[32]; // Up to 32 distinct hash values, one for each lexicon entry
    unsigned int results = 0; // Up to 32 bit flags set as result of hash (to quickly detect/reject collisions)
    bool foundSome = false; // flag remains clear if no 4 bit operations work. Try 5 bits...

    for( int mask = 0xF; !foundSome && mask <= 0x1F; mask |= mask<<1 ) {
        // Try hashes between 0-15, then try larger hashes 0-31

        for( op = ops; op < &ops[nOps]; op++ ) {
            // Try each of the "byte combining" operations

            for( f0 = fncs; f0 < &fncs[nFncs]; f0++ ) {
                // Use each byte mangle on 1st character

                for( f1 = fncs; f1 < &fncs[nFncs]; f1++ ) {
                    // Use each byte mangle on 2nd character

                    for( f2 = fncs; f2 < &fncs[nFncs]; f2++ ) {
                        // Use each byte mangle on 3rd character

                        results = 0;
                        for( int w = 0; w < nWords; w++ ) {
                            // Go thru entire lexicon (eg 12 month names)
                            // Mangle first 3 characters
                            char x = f0->fnc(wordList[w][0] &0x1F ); // low 5 bits of each char
                            char y = f1->fnc(wordList[w][1] &0x1F );
                            char z = f2->fnc(wordList[w][2] &0x1F );

                            int possHash = op->fnc( x, y, z ) & mask; // Combine 3 results and mask to 4/5 bits

                            hashes[w] = possHash; // remember this hash value of this lexicon "word"

                            // If collision detected, stop trying immediately and try something else
                            if( (0x01 << possHash) & results )
                                break;

                            results |= (0x01 << possHash); // mark as used and continue with more words
                        }

                        if( w >= nWords ) {
                            // WOW! This combination of operations on this lexicon results in NO collisions!!
                            // Report operations, some view of results and statistics,
                            // then formulate LUT as a string of '@' and 'A' to 'Z-ish'
                            printf( "%s (%s %s %s) hashes -", op->name, f0->name, f1->name, f2->name );
                            int min = hashes[0], max = hashes[0];
                            for( w = 0; w < nWords; w++ ) {

                                printf( " %2d", hashes[w] );
                                if( min > hashes[w] ) min = hashes[w];
                                if( max < hashes[w] ) max = hashes[w];
                            }
                            printf( "  min(%d) max(%d) range(%d) '%s'\n", min, max, max - min + 1, mkCodeStr( hashes, w ) );
                            foundSome = true;
                        }
                    }
                }
            }
        }
    }
}

static void demo( char *lst[], int num, int (*fnc)(char*) ) {
    // Output lexicon strings and each derived 'index' in two columns
    for( int i = 0; i < num ; i++ )
        printf( "%11s - %02d  #  %-5s - %02d\n", lst[ i ], fnc( lst[i] ), lst[ i+num ], fnc( lst[i+num] ) );
    printf( "\n" );
}


static int monthOrd( char *mn ) { return "DIE@CB@LJF@HAG@K"[ mn[1]/4&7 ^ mn[2]*2 &0xF ] &0xF; }

static void demo_MonthOrd( void ) {
// Convert a month name (or 3 letter abbrev.) to index (1-12). Beware false positives!
    char *ms[] = { 
        "January", "February", "March", "April", "May", "June", "July", "August", "Sept", "October", "Nov", "December",
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    };
    const int num = sizeof(ms)/sizeof(ms[0]);

    demo( ms, num/2, monthOrd );

//  seekHash3chars( ms, num/2 ); // disabled for CR posting...
}


static int wkdayOrd( char cp[] ) { return "65013427"[*cp/2 + ~cp[1] & 0x7] & 0x7; }

static void demo_WkDayOrd( void ) {
// Convert a day name (or 3 letter abbrev.) to index (1-7, Sun=1). Beware false positives!
    char *ds[] = {
        "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
        "SU", "MO", "TU", "WE", "TH", "FR", "SA"
    };
    const int num = sizeof(ds)/sizeof(ds[0]);

    demo( ds, num/2, wkdayOrd );

    // Show searching for candidate operation combinations
    puts( "\n\nPress enter for demo of search in action" );  getchar();
    seekHash3chars( ds, num/2 ); // enabled for CR posting...
}



int main( void ) {
    printf( "demo_MonthOrd()\n" );  demo_MonthOrd();

    puts( "\n\nPress enter for demo of weekday lexicon" ); getchar();

    printf( "demo_WkDayOrd()\n" );  demo_WkDayOrd();

    return 0;
}

NB: In this version of the code, mkCodeStr() trims the LUT string to a minimum length, effectively discarding contiguous '@' characters to the right of the highest valid mapping alpha character. This works fine in a scenario in which the candidate being hashed and mapped is definitely a member of the lexicon (but which member?). For instance, when dealing with tokens like month names that appear at a known location in a row of a computer generated table of records. No "caller verification" would be required.

In uses wherein the candidate token could be any random word, its hash could result in a value (0-15 or 0-31), this truncation of the LUT string must be manually undone before use.

Example: this program may propose a 'trimmed' LUT string like "@@BADCE@FG" (10 chars). This would be fine if the candidates were guaranteed to be one of the lexicon's words. For dealing with any 'random' word, the LUT string needs to be padded on the right "@@BADCE@FG@@@@@@" (16 chars) as the candidate string has potential, after bit masking operations, to lead to any hash value from 0 to 15.

Add caveat regarding false positives

Source Link

edited Apr 17 at 22:14

user272752

Restating a caveat: These functions blindly deal with the first 3 characters in an array (a "string"). The result is a usable ordinal based on the lexicon provided, BUT false positives are unavoidable. The calling code must, if need be, verify (strcmp()) against only ONE member of the entire lexicon. The intent is to speed up the translation of candidate strings to useful ordinal values without recourse to 'groping' as performed by, for instance, a binary search of a list.

Source Link

asked Apr 17 at 21:55

user272752

Desperately seeking a hashing function

A recent CR exchange led me to dust off and adapt some code I'd written a few years ago. Long, long ago, when core meant ferro-magnetic memory cells, I spent an hour formulating a "simple" hash function combining it with a sparse LUT to convert month names to month ordinals. More recently, to pass a rainy afternoon, I developed the following code to search-out alternatives (perhaps simpler mangling of characters of lexicon words; perhaps discovering a "Perfect Hash" for that lexicon.)

This has been a "personal toy coding project" up until the recent revitalisation of its (possible) utility. "Recalled to Life!" for any Dickens fans in the audience.

This is presented here both as a humble offering to any who might find it useful, AND to solicit comments and suggestions as to how it might be improved. (The nested iterations are somewhat embarrassing, but I've lost the motivation to revise and refactor it to use recursion when permuting the mangling functions.)

Welcome would be any/all significant commentary (please, no "should use const" comments.) Especially welcome would be code of a revised version that does this all better than this currently does! For your pleasure...

For posting here, the code below utilises the "Month to Ordinal" function, then the "Weekday to Ordinal" function, then goes wild with hundreds of alternative hashing operations for the latter that also work. (It scrolls the screen somewhat, but does conclude.) Adding another quest using a different lexicon should be self-evident.

// SeekHash:
//  Permute a bunch of operations looking for a combination
//  that results in no collisions
//  when applied to a small lexicon of strings.
//  Report those hashing operations and the "string LUT" for each.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

static char *mkCodeStr( int ph[], int nHashes ) {
    // Given a sparse array of small value integers and the useful element count,
    // make a string containing 'A', 'B', 'C', ... each at meaningful offsets.
    // Set unimportant bytes to '@' to achieve internal padding.
    // String used as LUT; maximum len is 32 (to suit SMALL lexicon only)
    // Yes, only 26 letters in alphabet, but...
    static char buf[32 + 1];

    memset( buf, '@', 32 ); // init "background"
    int maxHash = 0;
    for( int i = 0; i < nHashes; i++ ) {
        buf[ ph[i] ] = 'A' + i; // assign alpha representative into applicable element
        maxHash = ph[i] > maxHash ? ph[i] : maxHash; // remember max element assigned
    }
    buf[maxHash+1] = '\0'; // trim excess string length
    return buf;
}

typedef char (*fnc_t)( char );
// There are many 'trivial' bit operations possible to 'alter' a single byte/char.
static char bit2( char x ) { return (x>>1)&0x01; }
static char add1( char x ) { return x+1; }
static char sub1( char x ) { return x-1; }
static char invr( char x ) { return ~x; }
static char shR1( char x ) { return x>>1; }
static char shR2( char x ) { return x>>2; }
static char shL1( char x ) { return x<<1; }
static char shL2( char x ) { return x<<2; }
static char same( char x ) { return x; }
static char null( char x ) { return 0; }

typedef int (*op_t)( char, char, char );
// There are many operations to combine 3 bytes. These are a few of them.
static int sum_sum( char x, char y, char z ) { return x + y + z; }
static int xor_sum( char x, char y, char z ) { return x ^ y + z; }
static int sum_xor( char x, char y, char z ) { return x + y ^ z; }
static int lft_fld( char x, char y, char z ) { return sub1(x) + y ^ z; }

#define ENTRY(x) { #x, x, } // String-ize and the function address (pointer)

void seekHash3chars( char *wordList[], int nWords ) {
    // Each of leftmost 3 chars of every word will be subjected to a bit manipulation function.

    // 3 funcPtrs and an array of the "byte's bit mangling" operations available
    struct {
        char *name;
        fnc_t fnc;
    }  *f0, *f1, *f2, fncs[] = {
        ENTRY( null ),
        ENTRY( same ),
        ENTRY( shL1 ),
        ENTRY( shL2 ),
        ENTRY( shR1 ),
        ENTRY( shR2 ),
        ENTRY( invr ),
        ENTRY( sub1 ),
        ENTRY( add1 ),
        ENTRY( bit2 ),
    };
    const int nFncs = sizeof fncs / sizeof fncs[0];

    // List of "byte combining" operations (in same style as above)
    struct {
        char *name;
        op_t fnc;
    }  *op, ops[] = {
        ENTRY( sum_sum ),
        ENTRY( sum_xor ),
        ENTRY( xor_sum ),
        ENTRY( lft_fld ),
    };
    const int nOps = sizeof ops / sizeof ops[0];

    int hashes[32]; // Up to 32 distinct hash values, one for each lexicon entry
    unsigned int results = 0; // Up to 32 bit flags set as result of hash (to quickly detect/reject collisions)
    bool foundSome = false; // flag remains clear if no 4 bit operations work. Try 5 bits...

    for( int mask = 0xF; !foundSome && mask <= 0x1F; mask |= mask<<1 ) {
        // Try hashes between 0-15, then try larger hashes 0-32

        for( op = ops; op < &ops[nOps]; op++ ) {
            // Try each of the "byte combining" operations

            for( f0 = fncs; f0 < &fncs[nFncs]; f0++ ) {
                // Use each byte mangle on 1st character

                for( f1 = fncs; f1 < &fncs[nFncs]; f1++ ) {
                    // Use each byte mangle on 2nd character

                    for( f2 = fncs; f2 < &fncs[nFncs]; f2++ ) {
                        // Use each byte mangle on 3rd character

                        results = 0;
                        for( int w = 0; w < nWords; w++ ) {
                            // Go thru entire lexicon (eg 12 month names)
                            // Mangle first 3 characters
                            char x = f0->fnc(wordList[w][0] &0x1F ); // low 5 bits of each char
                            char y = f1->fnc(wordList[w][1] &0x1F );
                            char z = f2->fnc(wordList[w][2] &0x1F );

                            int possHash = op->fnc( x, y, z ) & mask; // Combine 3 results and mask to 4/5 bits

                            hashes[w] = possHash; // remember this hash value of this lexicon "word"

                            // If collision detected, stop trying immediately and try something else
                            if( (0x01 << possHash) & results )
                                break;

                            results |= (0x01 << possHash); // mark as used and continue with more words
                        }

                        if( w >= nWords ) {
                            // WOW! This combination of operations on this lexicon results in NO collisions!!
                            // Report operations, some view of results and statistics,
                            // then formulate LUT as a string of '@' and 'A' to 'Z-ish'
                            printf( "%s (%s %s %s) hashes -", op->name, f0->name, f1->name, f2->name );
                            int min = hashes[0], max = hashes[0];
                            for( w = 0; w < nWords; w++ ) {

                                printf( " %2d", hashes[w] );
                                if( min > hashes[w] ) min = hashes[w];
                                if( max < hashes[w] ) max = hashes[w];
                            }
                            printf( "  min(%d) max(%d) range(%d) '%s'\n", min, max, max - min + 1, mkCodeStr( hashes, w ) );
                            foundSome = true;
                        }
                    }
                }
            }
        }
    }
}

static void demo( char *lst[], int num, int (*fnc)(char*) ) {
    // Output lexicon strings and each derived 'index' in two columns
    for( int i = 0; i < num ; i++ )
        printf( "%11s - %02d  #  %-5s - %02d\n", lst[ i ], fnc( lst[i] ), lst[ i+num ], fnc( lst[i+num] ) );
    printf( "\n" );
}


static int monthOrd( char *mn ) { return "DIE@CB@LJF@HAG@K"[ mn[1]/4&7 ^ mn[2]*2 &0xF ] &0xF; }

static void demo_MonthOrd( void ) {
// Convert a month name (or 3 letter abbrev.) to index (1-12). Beware false positives!
    char *ms[] = { 
        "January", "February", "March", "April", "May", "June", "July", "August", "Sept", "October", "Nov", "December",
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    };
    const int num = sizeof(ms)/sizeof(ms[0]);

    demo( ms, num/2, monthOrd );

//  seekHash3chars( ms, num/2 ); // disabled for CR posting...
}


static int wkdayOrd( char cp[] ) { return "65013427"[*cp/2 + ~cp[1] & 0x7] & 0x7; }

static void demo_WkDayOrd( void ) {
// Convert a day name (or 3 letter abbrev.) to index (1-7, Sun=1). Beware false positives!
    char *ds[] = {
        "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
        "SU", "MO", "TU", "WE", "TH", "FR", "SA"
    };
    const int num = sizeof(ds)/sizeof(ds[0]);

    demo( ds, num/2, wkdayOrd );

    // Show searching for candidate operation combinations
    puts( "\n\nPress enter for demo of search in action" );  getchar();
    seekHash3chars( ds, num/2 ); // enabled for CR posting...
}



int main( void ) {
    printf( "demo_MonthOrd()\n" );  demo_MonthOrd();

    puts( "\n\nPress enter for demo of weekday lexicon" ); getchar();

    printf( "demo_WkDayOrd()\n" );  demo_WkDayOrd();

    return 0;
}

Acknowledging that 'gperf' will write the code to achieve a speed-efficient "perfect hashing result". The examples I've seen, however, seem to consider memory to be a large resource. Perhaps unsuitable for small microcontroller code...

c hash-map

Stack Exchange Network

Return to Question

Desperately seeking a hashing function