Optimised code to compute Huffman codes when implementing RFC 1951

Question

I found a trick to significantly reduce the time needed to compute Huffman codes in my implementation of RFC 1951. I was surprised to find that it also slightly improves compression, something I had not anticipated.

The trick is to encode the required information about a tree node in a ulong, arranged so that the ulong values can be compared directly, rather than by calling a delegate function which accessed related arrays to get the frequency and depth. This has a substantial effect on the performance.

The symbol or tree node id goes in the bottom 16 bits, the tree depth in the next 8 bits, and the frequency ( how many times the tree node is used ) in the remaining 40 bits. The revised code is below.

I think the reason compression is improved is that the comparison now includes the symbol id. This tends to assign the same number of bits to adjacent symbols if they have the same frequency, which in turn tends to reduce the number of bits needed to encode the lengths.

Here's the revised code:

struct HuffmanCoding // Variable length coding.
{
  public ushort Count; // Number of used symbols ( ComputeCodes trims trailing symbols that were given no code ).
  public byte [] Bits; // Number of bits used to encode a symbol ( 0 if the symbol is not used ).
  public ushort [] Codes; // Huffman code for a symbol ( bit 0 is most significant ).
  public int [] Used; // Count of how many times a symbol is used in the block being encoded.

  private int Limit; // Maximum number of bits for a code.
  private ushort [] Tree; // The two children of tree node t are Tree[ t - Count ] and Tree[ t ].

  // limit is the maximum code length in bits, symbols is the size of the alphabet.
  public HuffmanCoding( int limit, ushort symbols )
  {
    Limit = limit;
    Count = symbols;
    Bits = new byte[ symbols ];
    Codes = new ushort[ symbols ];
    Used = new int[ symbols * 2 ]; // Only the first half ( one entry per symbol ) is read by the code in this struct.
    Tree = new ushort[ symbols * 2 ]; // First half is one branch, second half is other branch.
  }

  // Returns the sum over all symbols of Used * Bits ( the number of bits needed to encode the counted symbols ).
  public int Total()
  {
    int result = 0, count = Count;
    for ( int i = 0; i < count; i += 1 ) 
      result += Used[ i ] * Bits[ i ];
    return result;
  }

  // Computes Bits and Codes from Used, then trims Count.
  // Returns true ( leaving Codes and Count untouched ) if a code longer than Limit would be needed.
  public bool ComputeCodes() // returns true if Limit is exceeded.
  {
    // A tree node is packed into a ulong so that nodes can be ordered by comparing the ulong values directly:
    // 16 bits for the symbol or tree node id, 8 bits for the depth, and the remaining 40 bits for the frequency.
    const int IdBits = 16, DepthBits = 8;
    const uint DepthOne = 1u << IdBits;
    const uint DepthMask = ( ( 1u << DepthBits ) - 1 ) << IdBits;
    const uint IdAndDepthMask = ( 1u << ( IdBits + DepthBits ) ) - 1;
    const ulong UsedMask = ulong.MaxValue << ( IdBits + DepthBits );

    ushort count = Count;

    // Start from a clean slate so that lengths left over from an earlier call cannot leak into the result.
    for ( int i = 0; i < count; i += 1 ) Bits[ i ] = 0;

    UlongHeap heap = new UlongHeap( count );

    for ( ushort i = 0; i < count; i += 1 )
    {
      int used = Used[ i ];
      if ( used > 0 )
      {
        // Leaf node: frequency in the high bits, depth zero, symbol id in the low bits.
        heap.Insert( ( (ulong)(uint)used << ( IdBits + DepthBits ) ) | i );
      }
    }

    int maxBits = 0;

    if ( heap.Count == 1 )
    { 
      // A single used symbol still gets a one bit code.
      // The cast deliberately keeps only the id bits, so it must not be overflow-checked.
      GetBits( unchecked( (ushort) heap.Remove() ), 1 );
      maxBits = 1;
    }
    else if ( heap.Count > 1 ) unchecked // The narrowing casts below deliberately discard the high bits.
    {
      ulong treeNode = count; // Ids of internal nodes follow the symbol ids.

      do // Keep pairing the lowest frequency TreeNodes.
      {
        ulong left = heap.Remove(); 
        Tree[ treeNode - count ] = (ushort) left;

        ulong right = heap.Remove(); 
        Tree[ treeNode ] = (ushort) right;

        // Extract depth of left and right nodes ( still shifted into position ).
        uint depthLeft = (uint)left & DepthMask, depthRight = (uint)right & DepthMask; 

        // New node depth is 1 + larger of depthLeft and depthRight.
        uint depth = ( depthLeft > depthRight ? depthLeft : depthRight ) + DepthOne;

        // Mask before adding, so the id and depth fields cannot carry into the frequency.
        heap.Insert( ( ( left & UsedMask ) + ( right & UsedMask ) ) | depth | treeNode );

        treeNode += 1;
      }  while ( heap.Count > 1 );

      uint root = ( (uint) heap.Remove() ) & IdAndDepthMask;
      maxBits = (int)( root >> IdBits ); // Depth of the root is the longest code length.
      if ( maxBits > Limit ) return true;
      GetBits( (ushort)root, 0 ); // Walk the tree to find the code lengths (Bits).
    }

    // Compute codes, code below is from RFC 1951 page 7.

    int [] bl_count = new int[ maxBits + 1 ];
    for ( int i = 0; i < count; i += 1 ) bl_count[ Bits[ i ] ] += 1;

    int [] next_code = new int[ maxBits + 1 ];
    int code = 0; bl_count[ 0 ] = 0;
    for ( int i = 0; i < maxBits; i += 1 ) 
    {
      code = ( code + bl_count[ i ] ) << 1;
      next_code[ i + 1 ] = code;
    }

    for ( int i = 0; i < count; i += 1 ) 
    {
      int length = Bits[ i ];
      if ( length != 0 ) 
      {
        Codes[ i ] = (ushort)Reverse( next_code[ length ], length );
        next_code[ length ] += 1;
      }
    }

    // Reduce count if there are unused symbols.
    while ( count > 0 && Bits[ count - 1 ] == 0 ) count -= 1;
    Count = count;

    return false;
  }

  // Stores length as the code length of every leaf below treeNode, adding one for each level descended.
  private void GetBits( ushort treeNode, int length )
  {
    if ( treeNode < Count ) // treeNode is a leaf.
    {
      Bits[ treeNode ] = (byte)length;
    }
    else 
    {
      length += 1;
      GetBits( Tree[ treeNode - Count ], length );
      GetBits( Tree[ treeNode ], length );
    }
  }

  private static int Reverse( int x, int bits )
  // Reverse a string of bits ( ready to be output as Huffman code ).
  { 
    int result = 0; 
    for ( int i = 0; i < bits; i += 1 ) 
    {
      result <<= 1; 
      result |= x & 1; 
      x >>= 1; 
    } 
    return result; 
  } 

} // end struct HuffmanCoding


// ******************************************************************************


struct UlongHeap // An array organised so the smallest element can be efficiently removed.
{
  // Number of elements currently held in the heap.
  public int Count { get{ return _Count; } }
  private int _Count;
  private ulong [] Array; // Heap storage: the children of Array[ j ] are Array[ 2j + 1 ] and Array[ 2j + 2 ].

  // Creates an empty heap with room for capacity elements ( Insert grows the storage if more are added ).
  public UlongHeap ( int capacity )
  {
    _Count = 0;
    Array = new ulong[ capacity ];
  }

  // Adds e to the heap.
  public void Insert( ulong e )
  {
    // Grow the storage when it is full ( or absent, as in a default-initialised struct ),
    // instead of failing with an index error after _Count has already been incremented.
    if ( Array == null || _Count == Array.Length )
    {
      ulong [] bigger = new ulong[ _Count < 4 ? 4 : _Count * 2 ];
      for ( int i = 0; i < _Count; i += 1 ) bigger[ i ] = Array[ i ];
      Array = bigger;
    }

    // Sift up: start at the new last slot and demote parents that are larger than e.
    int j = _Count++;
    while ( j > 0 )
    {
      int p = ( j - 1 ) >> 1; // Index of parent.
      ulong pe = Array[ p ];
      if ( e >= pe ) break;
      Array[ j ] = pe; // Demote parent.
      j = p;
    }    
    Array[ j ] = e;
  }

  // Removes and returns the smallest element.
  // Throws System.InvalidOperationException if the heap is empty ( the heap is left unchanged ).
  public ulong Remove() // Returns the smallest element.
  {
    if ( _Count <= 0 ) 
      throw new System.InvalidOperationException( "UlongHeap is empty." );

    ulong result = Array[ 0 ];
    _Count -= 1;
    ulong e = Array[ _Count ]; // The last element is re-inserted starting from the root.
    int j = 0;
    while ( true ) // Sift down: promote the smaller child until e fits.
    {
      int c = ( j + j ) + 1; if ( c >= _Count ) break;
      ulong ce = Array[ c ];
      if ( c + 1 < _Count )
      {
        ulong ce2 = Array[ c + 1 ];
        if ( ce2 < ce ) { c += 1; ce = ce2; }
      } 
      if ( ce >= e ) break;
      Array[ j ] = ce; j = c;  
    }
    Array[ j ] = e;
    return result;
  }

} // end struct UlongHeap

Nice trick, but there are also completely treeless approaches such as Engel coding — user555045
– user555045, Commented Jan 25, 2019 at 19:50
Thanks. I just started looking at engel coding. Up to now, I have relied on reducing the block size if the code length limit is exceeded, but I could do with a better approach. — George Barwood
– George Barwood, Commented Jan 25, 2019 at 20:19

Maxim · Accepted Answer · 2019-01-25 18:44:39Z

I see this is not your first post in the last week. Here are some comments that apply to all of your pieces of code.

Strange assignments:
```
public ushort Count;
```
...
```
public bool ComputeCodes() // returns true if Limit is exceeded.
{
    ushort count = Count;
```
What is the purpose of the last line? Count is already a ushort, and it's a field, not a property (a property being the case where such an assignment can make sense).
A lot of spaces:
```
UlongHeap heap = new UlongHeap( count );

for ( ushort i = 0; i < Count; i += 1 )
```
Don't use spaces after ( and [, and before ) and ]. They add noise to the code.
Don't use PascalCasing for private fields names. Many programmers use _ prefix to indicate private field.
Use var where it's appropriate. For example, here
```
UlongHeap heap = new UlongHeap( count );
```
it's obvious that heap will be of type UlongHeap. So this
```
var heap = new UlongHeap( count );
```
is more clear without redundant type specification.
Increments can be written as ++ instead of += 1.
It's better to use properties instead of public fields. Also properties like this
```
public int Count { get{ return _Count; } }
```
can be rewritten as
```
public int Count => _Count;
```
Use full names for variables and method parameters. So instead of dleft, dright and so on use depthLeft, depthRight.
Use consistent naming. You have variables treeNode, maxBits, bl_count, next_code. Use camelCasing (like maxBits) without underscores.
Magic numbers should be defined as constants. What is 0xff0000u or 0xffffffffff000000?
Place bodies of if statements on next line, like so
```
if (maxBits > Limit)
    return true;
```
Check arguments of public methods. For example, here
```
public HuffmanCoding( int limit, ushort symbols )
```
what will happen if the user passes a negative number for limit? You can check the value and throw an ArgumentOutOfRangeException with a message describing the error if the value is invalid.

George Barwood · Accepted Answer · 2019-01-26 14:06:52Z

Here's a revised version. I have added Package Merge code to handle the situation where the bit limit is exceeded ( rather than returning failure ). I have implemented many of Maxim's suggestions but not all.

On spacing I don't agree yet, but I will think about it. I use a leading _ for private fields only when there is a property clash. I prefer to stick to the earliest versions of C# unless there is a strong reason not to, so I don't use var or the => notation for properties.

This is an internal struct, so although it could check arguments, I think it's not essential, this is also why there are public fields. The weird names such as bl_count are from RFC 1951, I considered changing them, but I feel it makes it easier to compare my code with the standard.

struct HuffmanCoding // Variable length coding.
{
  public ushort Count; // Number of used symbols ( ComputeCodes trims trailing symbols that were given no code ).
  public byte [] Bits; // Number of bits used to encode a symbol ( code length ).
  public ushort [] Codes; // Huffman code for a symbol ( bit 0 is most significant ).
  public int [] Used; // Count of how many times a symbol is used in the block being encoded.

  private int Limit; // Limit on code length ( 15 or 7 for RFC 1951 ).
  private ushort [] Left, Right; // Tree storage.

  // limit is the maximum code length in bits, symbols is the size of the alphabet.
  public HuffmanCoding( int limit, ushort symbols )
  {
    Limit = limit;
    Count = symbols;
    Bits = new byte[ symbols ];
    Codes = new ushort[ symbols ];
    Used = new int[ symbols ];
    Left = new ushort[ symbols ];
    Right = new ushort[ symbols ];
  }

  // Returns the sum over all symbols of Used * Bits ( the number of bits needed to encode the counted symbols ).
  public int Total()
  {
    int result = 0;
    for ( int i = 0; i < Count; i += 1 ) 
      result += Used[ i ] * Bits[ i ];
    return result;
  }

  // Computes Bits and Codes from Used, then trims Count.
  // If the Huffman tree is deeper than Limit, the code lengths are computed by PackageMerge instead.
  public void ComputeCodes()
  {
    // Tree nodes are encoded in a ulong so they can be ordered by comparing the ulong values directly:
    // 16 bits for the id, 8 bits for the tree depth, and all the remaining high bits for Used
    // ( using every remaining bit means a sum of int frequencies is never truncated ).
    const int IdBits = 16, DepthBits = 8;
    const uint IdMask = ( 1u << IdBits ) - 1;
    const uint DepthOne = 1u << IdBits;
    const uint DepthMask = ( ( 1u << DepthBits ) - 1 ) << IdBits;
    const ulong UsedMask = ulong.MaxValue << ( IdBits + DepthBits );

    // Start from a clean slate: PackageMerge accumulates into Bits, and lengths left over
    // from an earlier call must not leak into the result.
    for ( int i = 0; i < Count; i += 1 ) 
      Bits[ i ] = 0;

    // First compute the number of bits to encode each symbol (Bits).
    UlongHeap heap = new UlongHeap( Count );

    for ( ushort i = 0; i < Count; i += 1 )
    {
      int used = Used[ i ];
      if ( used > 0 )
        heap.Insert( ( (ulong)(uint)used << ( IdBits + DepthBits ) ) | i );
    }

    int maxBits = 0;

    if ( heap.Count == 1 )
    { 
      // A single used symbol still gets a one bit code.
      GetBits( unchecked( (ushort) heap.Remove() ), 1 );
      maxBits = 1;
    }
    else if ( heap.Count > 1 ) unchecked // The narrowing casts below deliberately discard the high bits.
    {
      ulong treeNode = Count; // Ids of internal nodes follow the symbol ids.

      do // Keep pairing the lowest frequency TreeNodes.
      {
        ulong left = heap.Remove(); 
        Left[ treeNode - Count ] = (ushort) left;

        ulong right = heap.Remove(); 
        Right[ treeNode - Count ] = (ushort) right;

        // Extract depth of left and right nodes ( still shifted though ).
        uint depthLeft = (uint)left & DepthMask, depthRight = (uint)right & DepthMask; 

        // New node depth is 1 + larger of depthLeft and depthRight.
        uint depth = ( depthLeft > depthRight ? depthLeft : depthRight ) + DepthOne;

        // Mask before adding, so the id and depth fields cannot carry into the frequency.
        heap.Insert( ( ( left & UsedMask ) + ( right & UsedMask ) ) | depth | treeNode );

        treeNode += 1;
      }  while ( heap.Count > 1 );

      uint root = ( (uint) heap.Remove() ) & ( DepthMask | IdMask );
      maxBits = (int)( root >> IdBits ); // Depth of the root is the longest code length.
      if ( maxBits <= Limit )
        GetBits( (ushort)root, 0 );
      else
      {
        maxBits = Limit;
        PackageMerge();
      }
    }

    // Computation of code lengths (Bits) is complete.
    // Now compute Codes, code below is from RFC 1951 page 7.

    int [] bl_count = new int[ maxBits + 1 ];
    for ( int i = 0; i < Count; i += 1 ) 
      bl_count[ Bits[ i ] ] += 1; 

    int [] next_code = new int[ maxBits + 1 ];
    int code = 0; bl_count[ 0 ] = 0;
    for ( int i = 0; i < maxBits; i += 1 ) 
    {
      code = ( code + bl_count[ i ] ) << 1;
      next_code[ i + 1 ] = code;
    }

    for ( int i = 0; i < Count; i += 1 ) 
    {
      int length = Bits[ i ];
      if ( length != 0 ) 
      {
        Codes[ i ] = (ushort) Reverse( next_code[ length ], length );
        next_code[ length ] += 1;
      }
    }

    // Reduce Count if there are unused symbols.
    while ( Count > 0 && Bits[ Count - 1 ] == 0 ) Count -= 1;
  }

  // Stores length as the code length of every leaf below treeNode, adding one for each level descended.
  private void GetBits( ushort treeNode, int length )
  {
    if ( treeNode < Count ) // treeNode is a leaf.
    {
      Bits[ treeNode ] = (byte)length;
    }
    else 
    {
      treeNode -= Count;
      length += 1;
      GetBits( Left[ treeNode ], length );
      GetBits( Right[ treeNode ], length );
    }
  }

  private static int Reverse( int x, int bits )
  // Reverse a string of bits ( ready to be output as Huffman code ).
  { 
    int result = 0; 
    for ( int i = 0; i < bits; i += 1 ) 
    {
      result <<= 1; 
      result |= x & 1; 
      x >>= 1; 
    } 
    return result; 
  } 

  // PackageMerge is used if the Limit code length limit is reached.
  // The result is technically not a Huffman code in this case ( due to the imposed limit ).
  // See https://en.wikipedia.org/wiki/Package-merge_algorithm for a description of the algorithm.
  // Each symbol's code length is the number of times it appears in the finally selected packages,
  // so Bits must be zero for every symbol on entry ( ComputeCodes clears it ).
  // Ids are stored in 16 bits, so Count plus the number of packages created must not exceed 65536.

  private void PackageMerge()
  {
    // Tree nodes are encoded in a ulong using 16 bits for the id and the remaining high bits for Used.
    const int IdBits = 16;
    const ulong UsedMask = ulong.MaxValue << IdBits;

    // First sort the used symbols using Heapsort ( same "Used > 0" test as ComputeCodes ).
    UlongHeap heap = new UlongHeap( Count );
    for ( uint i = 0; i < Count; i += 1 ) 
    {
      if ( Used[ i ] > 0 ) 
      {
        heap.Insert( ( (ulong)(uint)Used[ i ] << IdBits ) | i );
      }
    }
    int n = heap.Count; 
    ulong [] sorted = new ulong[ n ];
    for ( int i = 0; i < n; i += 1 ) sorted[ i ] = heap.Remove();

    // Package ids start at Count and each of the Limit rounds creates fewer than n packages,
    // so Count + n * Limit slots are always enough. ( Count * Limit was one slot short when
    // all Count == 2^Limit symbols are used. )
    Left = new ushort[ Count + n * Limit ];
    Right = new ushort[ Count + n * Limit ];

    // List class is from System.Collections.Generic.
    List<ulong> merged = new List<ulong>( Count ), 
                next = new List<ulong>( Count );

    uint package = (uint) Count; // Allocator for package ids.

    for ( int i = 0; i < Limit; i += 1 ) 
    {
      int j = 0, k = 0; // Indexes into the lists being merged ( j for sorted, k for merged ).
      next.Clear();
      for ( int total = ( sorted.Length + merged.Count ) / 2; total > 0; total -= 1 )  
      {
        // The two smallest remaining tree nodes are packaged together.
        ulong left = TakeSmallest( sorted, ref j, merged, ref k );
        ulong right = TakeSmallest( sorted, ref j, merged, ref k );

        Left[ package ] = unchecked( (ushort) left );
        Right[ package ] = unchecked( (ushort) right );

        // Mask before adding, so the id fields cannot carry into the frequency.
        next.Add( ( ( left & UsedMask ) + ( right & UsedMask ) ) | package );        
        package += 1;
      }

      // Swap merged and next.
      List<ulong> tmp = merged; merged = next; next = tmp;
    }

    for ( int i = 0; i < merged.Count; i += 1 )
      MergeGetBits( unchecked( (ushort) merged[ i ] ) );
  }

  // Removes and returns the smaller of the next unconsumed elements of sorted ( at j ) and merged ( at k ).
  private static ulong TakeSmallest( ulong [] sorted, ref int j, List<ulong> merged, ref int k )
  {
    if ( k < merged.Count && ( j >= sorted.Length || merged[ k ] < sorted[ j ] ) )
      return merged[ k++ ];
    return sorted[ j++ ];
  }

  // Adds one to the code length of every symbol contained ( directly or via nested packages ) in node.
  private void MergeGetBits( ushort node )
  {
    if ( node < Count )
      Bits[ node ] += 1;
    else
    {
      MergeGetBits( Left[ node ] );
      MergeGetBits( Right[ node ] );
    }
  }

} // end struct HuffmanCoding


// ******************************************************************************


struct UlongHeap // An array organised so the smallest element can be efficiently removed.
{
  // Number of elements currently held in the heap.
  public int Count { get{ return _Count; } }
  private int _Count;
  private ulong [] Array; // Heap storage: the children of Array[ j ] are Array[ 2j + 1 ] and Array[ 2j + 2 ].

  // Creates an empty heap with room for capacity elements ( Insert grows the storage if more are added ).
  public UlongHeap ( int capacity )
  {
    _Count = 0;
    Array = new ulong[ capacity ];
  }

  // Adds e to the heap.
  public void Insert( ulong e )
  {
    // Grow the storage when it is full ( or absent, as in a default-initialised struct ),
    // instead of failing with an index error after _Count has already been incremented.
    if ( Array == null || _Count == Array.Length )
    {
      ulong [] bigger = new ulong[ _Count < 4 ? 4 : _Count * 2 ];
      for ( int i = 0; i < _Count; i += 1 ) bigger[ i ] = Array[ i ];
      Array = bigger;
    }

    // Sift up: start at the new last slot and demote parents that are larger than e.
    int j = _Count++;
    while ( j > 0 )
    {
      int p = ( j - 1 ) >> 1; // Index of parent.
      ulong pe = Array[ p ];
      if ( e >= pe ) break;
      Array[ j ] = pe; // Demote parent.
      j = p;
    }    
    Array[ j ] = e;
  }

  // Removes and returns the smallest element.
  // Throws System.InvalidOperationException if the heap is empty ( the heap is left unchanged ).
  public ulong Remove() // Returns the smallest element.
  {
    if ( _Count <= 0 ) 
      throw new System.InvalidOperationException( "UlongHeap is empty." );

    ulong result = Array[ 0 ];
    _Count -= 1;
    ulong e = Array[ _Count ]; // The last element is re-inserted starting from the root.
    int j = 0;
    while ( true ) // Sift down: promote the smaller child until e fits.
    {
      int c = ( j + j ) + 1; if ( c >= _Count ) break;
      ulong ce = Array[ c ];
      if ( c + 1 < _Count )
      {
        ulong ce2 = Array[ c + 1 ];
        if ( ce2 < ce ) { c += 1; ce = ce2; }
      } 
      if ( ce >= e ) break;
      Array[ j ] = ce; j = c;  
    }
    Array[ j ] = e;
    return result;
  }

} // end struct UlongHeap

Stack Exchange Network

Optimised code to compute Huffman codes when implementing RFC 1951

2 Answers 2

You must log in to answer this question.

Linked

Hot Network Questions

Optimised code to compute Huffman codes when implementing RFC 1951

2 Answers 2

You must log in to answer this question.

Linked

Related

Hot Network Questions