// SeekHash:
// Permute a bunch of operations looking for a combination
// that results in no collisions
// when applied to a small lexicon of strings.
// Report those hashing operations and the "string LUT" for each.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
static char *mkCodeStr( int ph[], int nHashes ) {
// Given a sparsean array of small value integers and the useful element count,
// make a string containing 'A', 'B', 'C', ... each at meaningful offsets.
// Set unimportant bytes to '@' to achieve internal padding.
// String used as LUT; maximum len is 32 (to suit SMALL lexicon only)
// Yes, only 26 letters in alphabet, but...
static char buf[32 + 1];
memset( buf, '@', 32 ); // init "background"
int maxHash = 0;
for( int i = 0; i < nHashes; i++ ) {
buf[ ph[i] ] = 'A' + i; // assign alpha representative into applicable element
maxHash = ph[i] > maxHash ? ph[i] : maxHash; // remember max element assigned
}
buf[maxHash+1] = '\0'; // trim excess string length
return buf;
}
typedef char (*fnc_t)( char );
// There are many 'trivial' bit operations possible to 'alter' a single byte/char.
static char bit2( char x ) { return (x>>1)&0x01; }
static char add1( char x ) { return x+1; }
static char sub1( char x ) { return x-1; }
static char invr( char x ) { return ~x; }
static char shR1( char x ) { return x>>1; }
static char shR2( char x ) { return x>>2; }
static char shL1( char x ) { return x<<1; }
static char shL2( char x ) { return x<<2; }
static char same( char x ) { return x; }
static char null( char x ) { return 0; }
typedef int (*op_t)( char, char, char );
// There are many operations to combine 3 bytes. These are a few of them.
static int sum_sum( char x, char y, char z ) { return x + y + z; }
static int xor_sum( char x, char y, char z ) { return x ^ y + z; }
static int sum_xor( char x, char y, char z ) { return x + y ^ z; }
static int lft_fld( char x, char y, char z ) { return sub1(x) + y ^ z; }
#define ENTRY(x) { #x, x, } // String-ize and the function address (pointer)
void seekHash3chars( char *wordList[], int nWords ) {
// Each of leftmost 3 chars of every word will be subjected to a bit manipulation function.
// 3 funcPtrs and an array of the "byte's bit mangling" operations available
struct {
char *name;
fnc_t fnc;
} *f0, *f1, *f2, fncs[] = {
ENTRY( null ),
ENTRY( same ),
ENTRY( shL1 ),
ENTRY( shL2 ),
ENTRY( shR1 ),
ENTRY( shR2 ),
ENTRY( invr ),
ENTRY( sub1 ),
ENTRY( add1 ),
ENTRY( bit2 ),
};
const int nFncs = sizeof fncs / sizeof fncs[0];
// List of "byte combining" operations (in same style as above)
struct {
char *name;
op_t fnc;
} *op, ops[] = {
ENTRY( sum_sum ),
ENTRY( sum_xor ),
ENTRY( xor_sum ),
ENTRY( lft_fld ),
};
const int nOps = sizeof ops / sizeof ops[0];
int hashes[32]; // Up to 32 distinct hash values, one for each lexicon entry
unsigned int results = 0; // Up to 32 bit flags set as result of hash (to quickly detect/reject collisions)
bool foundSome = false; // flag remains clear if no 4 bit operations work. Try 5 bits...
for( int mask = 0xF; !foundSome && mask <= 0x1F; mask |= mask<<1 ) {
// Try hashes between 0-15, then try larger hashes 0-3231
for( op = ops; op < &ops[nOps]; op++ ) {
// Try each of the "byte combining" operations
for( f0 = fncs; f0 < &fncs[nFncs]; f0++ ) {
// Use each byte mangle on 1st character
for( f1 = fncs; f1 < &fncs[nFncs]; f1++ ) {
// Use each byte mangle on 2nd character
for( f2 = fncs; f2 < &fncs[nFncs]; f2++ ) {
// Use each byte mangle on 3rd character
results = 0;
for( int w = 0; w < nWords; w++ ) {
// Go thru entire lexicon (eg 12 month names)
// Mangle first 3 characters
char x = f0->fnc(wordList[w][0] &0x1F ); // low 5 bits of each char
char y = f1->fnc(wordList[w][1] &0x1F );
char z = f2->fnc(wordList[w][2] &0x1F );
int possHash = op->fnc( x, y, z ) & mask; // Combine 3 results and mask to 4/5 bits
hashes[w] = possHash; // remember this hash value of this lexicon "word"
// If collision detected, stop trying immediately and try something else
if( (0x01 << possHash) & results )
break;
results |= (0x01 << possHash); // mark as used and continue with more words
}
if( w >= nWords ) {
// WOW! This combination of operations on this lexicon results in NO collisions!!
// Report operations, some view of results and statistics,
// then formulate LUT as a string of '@' and 'A' to 'Z-ish'
printf( "%s (%s %s %s) hashes -", op->name, f0->name, f1->name, f2->name );
int min = hashes[0], max = hashes[0];
for( w = 0; w < nWords; w++ ) {
printf( " %2d", hashes[w] );
if( min > hashes[w] ) min = hashes[w];
if( max < hashes[w] ) max = hashes[w];
}
printf( " min(%d) max(%d) range(%d) '%s'\n", min, max, max - min + 1, mkCodeStr( hashes, w ) );
foundSome = true;
}
}
}
}
}
}
}
static void demo( char *lst[], int num, int (*fnc)(char*) ) {
// Output lexicon strings and each derived 'index' in two columns
for( int i = 0; i < num ; i++ )
printf( "%11s - %02d # %-5s - %02d\n", lst[ i ], fnc( lst[i] ), lst[ i+num ], fnc( lst[i+num] ) );
printf( "\n" );
}
static int monthOrd( char *mn ) { return "DIE@CB@LJF@HAG@K"[ mn[1]/4&7 ^ mn[2]*2 &0xF ] &0xF; }
static void demo_MonthOrd( void ) {
// Convert a month name (or 3 letter abbrev.) to index (1-12). Beware false positives!
char *ms[] = {
"January", "February", "March", "April", "May", "June", "July", "August", "Sept", "October", "Nov", "December",
"JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
};
const int num = sizeof(ms)/sizeof(ms[0]);
demo( ms, num/2, monthOrd );
// seekHash3chars( ms, num/2 ); // disabled for CR posting...
}
static int wkdayOrd( char cp[] ) { return "65013427"[*cp/2 + ~cp[1] & 0x7] & 0x7; }
static void demo_WkDayOrd( void ) {
// Convert a day name (or 3 letter abbrev.) to index (1-7, Sun=1). Beware false positives!
char *ds[] = {
"Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
"SU", "MO", "TU", "WE", "TH", "FR", "SA"
};
const int num = sizeof(ds)/sizeof(ds[0]);
demo( ds, num/2, wkdayOrd );
// Show searching for candidate operation combinations
puts( "\n\nPress enter for demo of search in action" ); getchar();
seekHash3chars( ds, num/2 ); // enabled for CR posting...
}
int main( void ) {
printf( "demo_MonthOrd()\n" ); demo_MonthOrd();
puts( "\n\nPress enter for demo of weekday lexicon" ); getchar();
printf( "demo_WkDayOrd()\n" ); demo_WkDayOrd();
return 0;
}
NB: In this version of the code, mkCodeStr() trims the LUT string to a minimum length, effectively discarding contiguous '@' characters to the right of the highest valid mapping alpha character. This works fine in a scenario in which the candidate being hashed and mapped is definitely a member of the lexicon (but which member?). For instance, when dealing with tokens like month names that appear at a known location in a row of a computer generated table of records. No "caller verification" would be required.
In uses wherein the candidate token could be any random word, its hash could result in a value (0-15 or 0-31), this truncation of the LUT string must be manually undone before use.
Example: this program may propose a 'trimmed' LUT string like "@@BADCE@FG" (10 chars). This would be fine if the candidates were guaranteed to be one of the lexicon's words. For dealing with any 'random' word, the LUT string needs to be padded on the right "@@BADCE@FG@@@@@@" (16 chars) as the candidate string has potential, after bit masking operations, to lead to any hash value from 0 to 15.