Python 3.x Hexdump

Question

Not long ago I posted a small hexdump generator function from a program I've been writing, and I have since applied what a reviewer suggested. The goal is to lazily hexdump bytes objects (byte strings, binary files) without committing to I/O code.

Here is the relevant code (minus docstrings/tests/script), with some description below:

import re
from itertools import islice


class HexdumpGenerator:
    """Lazily render an iterable of byte values as hexdump rows.

    Every row is produced from ``fmt`` and has three columns: the row
    address (formatted with ``col0``), the byte values (formatted with
    ``col1``, joined by single spaces, with the separator inserted at the
    middle of the row) and a text column in which non-printable
    characters are replaced by ``.``.  The first row is padded on the left
    so that every row address is a multiple of ``step``; the last row is
    padded on the right.

    Iteration is single-pass: it consumes ``iterable`` and advances the
    ``start``, ``_next`` and ``_mod`` attributes as rows are produced.
    """

    def __init__(self, iterable, base_addr=0, start=0, stop=None, step=16, sep='\b'):
        # Formatting knobs that callers may override after construction.
        self.fmt = '{}   {}  {}'
        self.col0 = '08X'
        self.col1 = '02X'
        self.placeholder = ['  ']
        # Slice of the input that will actually be dumped.
        self.base_addr = base_addr
        self.start = start
        self.stop = stop
        self.step = step
        self.iterable = islice(iterable, start, stop)
        # Only the first character of ``sep`` is used.
        self._sep = sep[0]
        # Number of padding cells in front of the first row ...
        self._mod = (base_addr + start) % self.step
        # ... and the input position at which the current row ends.
        self._next = start + self.step - self._mod

    def __iter__(self):
        while True:
            chunk = bytearray(islice(self.iterable, self._next - self.start))
            if not chunk:
                return
            lead = self._mod
            address = format(self.base_addr + self.start - lead, self.col0)
            cells = lead * self.placeholder + [format(b, self.col1) for b in chunk]
            text = lead * ' ' + ''.join(
                c if c.isprintable() else '.' for c in map(chr, chunk))
            # Whatever is missing to fill the row becomes trailing padding.
            tail = self.step - len(cells)
            self._mod = tail
            cells += tail * self.placeholder
            text += tail * ' '
            cells.insert(self.step // 2, self._sep)
            yield self.fmt.format(address, ' '.join(cells), text)
            self.start = self._next
            self._next += self.step


class CompressHexdumpGenerator(HexdumpGenerator):
    """Hexdump generator that collapses runs of identical rows.

    A row whose whitespace-separated fields (ignoring the address) equal
    those of the last emitted row is suppressed; the first suppressed row
    of a run is replaced by a single ``*``.  If the dump ends inside such
    a run, the last emitted row is repeated with the address of the final
    suppressed row.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.delimiter = ' '   # character that ends the address column
        self.duplicates = 0    # rows suppressed since the last emitted row
        self.row = ''          # last row that was emitted verbatim

    def _compress(self):
        """Return the last emitted row re-addressed to the final duplicate."""
        split_at = self.row.index(self.delimiter)
        address = int(self.row[:split_at], 16) + self.duplicates * self.step
        return format(address, self.col0) + self.row[split_at:]

    def __iter__(self):
        for line in super().__iter__():
            if line.split()[1:] != self.row.split()[1:]:
                yield line
                self.row = line
                self.duplicates = 0
                continue
            # Same content as the previous row: mark the run only once.
            if self.duplicates == 0:
                yield '*'
            self.duplicates += 1
        if self.duplicates:
            yield self._compress()


class FromHexdumpGenerator(CompressHexdumpGenerator):
    """Turn hexdump rows (optionally ``*``-compressed) back into bytes.

    ``iterable`` yields text rows; iterating the instance yields one
    ``bytearray`` per reconstructed row.  ``base`` is the radix of the
    value cells and ``len`` (a string) is the minimum length of the
    whitespace run that marks the end of the value column.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.base = 16
        self.len = '3'

    def get_repr(self, _row):
        """Decode the value cells of a split row into a ``bytearray``.

        ``_row`` is a row split on whitespace with the separators kept;
        the first two items (address and the gap after it) are skipped.
        Decoding stops at the first whitespace run of at least
        ``self.len`` characters.
        """
        row = bytearray()
        # Built per call so that changes to ``self.len`` are honoured.
        gap = re.compile(r'(\s{' + self.len + r',})')
        for i in _row[2:self.step * 2 + 1]:
            if i.isalnum():
                row.append(int(i, self.base))
            elif gap.match(i):
                break
        return row

    def decompress_gen(self, row0, row1):
        """Yield the rows hidden behind a ``*`` marker.

        ``row0`` is the split row before the marker and ``row1`` the split
        row after it; one copy of ``row0``'s data is yielded for every
        ``step``-sized address strictly between the two.
        """
        i = int(row0[0].rstrip(self.delimiter), 16) + self.step
        j = int(row1[0].rstrip(self.delimiter), 16)
        while i < j:
            row = format(i, self.col0) + self.delimiter
            row = [row.rstrip(' ')] + row0[1:]
            yield self.get_repr(row)
            i += self.step

    def __iter__(self):
        i = j = ''
        while True:
            # A row read ahead while handling '*' is processed next.
            row = j if j else next(self.iterable, None)
            if row is None:
                break
            elif row in ('*', '*\n'):
                j = next(self.iterable, None)
                if j is None:
                    # Nothing follows the marker.  A bare next() would raise
                    # StopIteration here, which PEP 479 turns into a
                    # RuntimeError; end the stream cleanly instead.
                    break
                yield from self.decompress_gen(i, j.split())
            else:
                index = row.find(self._sep)
                # Drop the separator only when it is present: with
                # index == -1 the slices below would mangle the row.
                i = row[:index] + row[index + 1:] if index >= 0 else row
                i = re.split(r'(\s+)', i)
                j = ''
                yield self.get_repr(i)

Utility functions:

from itertools import chain


def read_binary_gen(file):
    """Yield the content of the binary file *file* one byte (int) at a time.

    The file is read in fixed-size chunks.  Iterating a binary file
    object directly splits on ``b'\\n'``, so a file without newline
    bytes would be pulled into memory as one single "line".
    """
    with open(file, 'rb') as f:
        # iter(callable, sentinel) stops at the empty read that marks EOF.
        for chunk in iter(lambda: f.read(65536), b''):
            yield from chunk


def write(file, gen):
    """Write each string produced by *gen* to the text file *file*, one per line."""
    with open(file, 'w') as out:
        out.writelines(line + '\n' for line in gen)


def read_gen(file):
    """Yield the lines of the text file *file* one at a time, line endings included."""
    with open(file, 'r') as src:
        for line in src:
            yield line


def write_binary(file, gen):
    """Write every bytes-like item produced by *gen* to the binary file *file*."""
    with open(file, 'wb') as out:
        out.writelines(gen)

read_binary_gen() is meant to feed the first two generator classes and read_gen() the last one, so that a file never has to be read into memory in full.

I've tested the code with different formats (03o, 03d): if specifying 03d, then the placeholder attribute must be assigned a list with a single string composed of 3 spaces. Using FromHexdumpGenerator to undo the hexdump would then require that base be assigned the integer 10, and len the number '4' (3 + 1). The col0 attribute must remain hex (as it is an address).

If fmt's first column ends with a colon (as I've seen other programs use), the delimiter must be set to that value.

I struggled with different encodings before figuring out this was a case for bytearray, so if a bytes object can be dumped, it can be undumped.

EDIT

@Peilonrayz I attempted to fix the issues you pointed out and came up with the following (this is the whole module, with some additional functionality I'd been working on):

from itertools import islice, takewhile, tee, chain
from re import match, split
from copy import copy
from colorama import init, Style

# NOTE(review): self-assignment of the imported name; presumably it keeps
# `tee` referenced so it can be used through this module -- confirm intent.
tee = tee
# Initialise colorama (imported above) at import time.
init()

# Module-wide formatting state; change_format() rebinds these names.
COL0 = '08X'          # format spec of the address column
COL1 = '02X'          # format spec of each byte value
PAD = '  '            # placeholder used where a row has no byte value
FMT = '{}   {}  {}'   # row template: address, values, text
DLM = ' '             # character that ends the address column
BASE = 16             # radix used to parse byte values back


def change_format(col0, col1, fmt=''):
    """Rebind the module-level formatting settings.

    ``col0`` and ``col1`` become the format specs of the address and
    value columns.  The placeholder width is taken from ``col1`` without
    its last character and the parsing radix from that last character
    (``b``, ``o``, ``d``, ``x`` or ``X``).  A non-empty ``fmt`` replaces
    the row template, and the character right after its first ``}``
    becomes the address delimiter.
    """
    global COL0, COL1, PAD, FMT, DLM, BASE
    radix_by_code = {'b': 2, 'o': 8, 'd': 10, 'x': 16, 'X': 16}
    COL0, COL1 = col0, col1
    PAD = int(col1[:-1]) * ' '
    if fmt:
        FMT = fmt
        DLM = fmt[fmt.index('}') + 1]
    BASE = radix_by_code[col1[-1]]


def fix(it, offset, start, stop, step):
    """Return ``(sliced iterator, leading pad count, end of the first row)``.

    The pad count is ``(offset + start) % step``; the first row ends at
    the input position ``start + step - pad``.
    """
    lead = (offset + start) % step
    first_row_end = start + step - lead
    window = islice(it, start, stop)
    return window, lead, first_row_end


def mk_row(it, nxt, start):
    """Consume up to ``nxt - start`` items from *it* and return them as a bytearray."""
    width = nxt - start
    return bytearray(islice(it, width))


def to_hex(i):
    """Format the byte value *i* with the module-level ``COL1`` format spec."""
    return '{:{}}'.format(i, COL1)


def to_chr(i):
    """Return the character for code point *i*, or ``'.'`` if it is not printable."""
    ch = chr(i)
    if ch.isprintable():
        return ch
    return '.'


def pad_gen(fn, pad, it, n, step):
    """Yield one row of cells: *n* pads, ``fn(item)`` per item, then trailing pads.

    Trailing pads are added until the number of converted items reaches
    ``step - n``; nothing is added when the items already fill the row.
    """
    yield from [pad] * n
    emitted = 0
    for emitted, item in enumerate(it, 1):
        yield fn(item)
    yield from [pad] * (step - n - emitted)


def hexdump_gen(it, offset=0, start=0, stop=None, step=16, sep='\b'):
    """Yield formatted hexdump rows for the byte values in *it*.

    ``start``/``stop`` select the slice of *it* to dump, ``offset`` is
    added to the displayed addresses, ``step`` is the number of values
    per row and ``sep`` is inserted at the middle of the value column.
    Only the first row gets leading padding.
    """
    it, lead, row_end = fix(it, offset, start, stop, step)
    half = step // 2
    chunk = mk_row(it, row_end, start)
    while chunk:
        address = format(offset + start - lead, COL0)
        cells = list(pad_gen(to_hex, PAD, chunk, lead, step))
        text = ''.join(pad_gen(to_chr, ' ', chunk, lead, step))
        cells.insert(half, sep)
        yield FMT.format(address, ' '.join(cells), text)
        # Every row after the first starts on a ``step`` boundary.
        start, row_end, lead = row_end, row_end + step, 0
        chunk = mk_row(it, row_end, start)


def compress_hexdump_gen(*args, **kwargs):
    """Yield hexdump rows with runs of identical rows collapsed to ``*``.

    Accepts the same arguments as ``hexdump_gen``.  A row whose fields
    (ignoring the address) equal those of the last emitted row is
    suppressed; the first suppressed row of a run is replaced by ``*``.
    When the dump ends after more than one suppressed row, the last
    emitted row is repeated with the address of the final suppressed row.
    When exactly one row was suppressed at the end, the output ends with
    ``*``, which ``from_hexdump_gen`` reads as one repeat of the previous
    row.
    """
    # ``step`` is the fifth positional parameter of hexdump_gen; looking
    # only at kwargs would silently fall back to 16 for positional callers.
    if 'step' in kwargs:
        step = kwargs['step']
    elif len(args) > 4:
        step = args[4]
    else:
        step = 16
    row = ''
    duplicates = 0
    for i in hexdump_gen(*args, **kwargs):
        if row.split()[1:] == i.split()[1:]:
            if not duplicates:
                yield '*'
            duplicates += 1
        else:
            yield i
            row = i
            duplicates = 0
    if duplicates > 1:
        index = row.index(DLM)
        col0 = int(row[:index], 16) + duplicates * step
        yield format(col0, COL0) + row[index:]


def predicate(i):
    """Return True unless *i* starts with a run of three or more whitespace characters."""
    # Raw string: ``\s`` in a plain literal is an invalid escape sequence.
    return not match(r'(\s{3,})', i)


def highlight(row, sep, ba):
    """Wrap selected value cells of *row* in colorama's bright style, in place.

    Cells are examined up to the first whitespace run of three or more
    characters.  A value is highlighted when it is contained in *ba*, or,
    if *ba* is empty, when its character is printable.
    """
    snapshot = copy(row)  # iterate over a copy because *row* is modified
    for n, cell in enumerate(takewhile(predicate, snapshot)):
        if cell == sep or ' ' in cell:
            continue
        value = int(cell, BASE)
        wanted = value in ba
        if not wanted and not ba:
            wanted = chr(value).isprintable()
        if wanted:
            row[n] = Style.BRIGHT + cell + Style.RESET_ALL


def highlight_hexdump_gen(it, step=16, sep='\b', ba=b''):
    """Yield the hexdump rows of *it* with selected values highlighted.

    Each row is split on whitespace (separators kept), the value cells
    are passed to ``highlight`` together with *sep* and *ba*, and the row
    is joined back together.  ``step`` is the number of values per row.
    """
    ba = bytearray(ba)
    # Value cells and their separators occupy items 2..index-1 of the
    # split row; a non-empty sep adds two more items (itself and a gap).
    index = step * 2 + (1 if not sep else 3)
    for i in it:
        # Raw string: ``\s`` in a plain literal is an invalid escape sequence.
        row0 = split(r'(\s+)', i)
        row1 = row0[2:index]
        highlight(row1, sep, ba)
        yield ''.join(row0[:2] + row1 + row0[index:])


def to_bytes(row, step):
    """Decode the value cells of a split hexdump row into a bytearray.

    The first two items of *row* are skipped, colorama highlight codes
    are stripped, and decoding stops at the first whitespace run of three
    or more characters.
    """
    cells = row[2:step * 2 + 1]
    ba = bytearray()
    for cell in takewhile(predicate, cells):
        plain = cell.replace(Style.BRIGHT, '').replace(Style.RESET_ALL, '')
        if not plain.isalnum():
            continue
        ba.append(int(plain, BASE))
    return ba


def decompress_gen(row0, row1, step):
    """Yield the bytes of the rows hidden between split rows *row0* and *row1*.

    One copy of *row0*'s data is produced for every ``step``-sized
    address strictly between the addresses of the two rows.
    """
    address = int(row0[0].rstrip(DLM), 16) + step
    limit = int(row1[0].rstrip(DLM), 16)
    while address < limit:
        head = (format(address, COL0) + DLM).rstrip(' ')
        yield to_bytes([head] + row0[1:], step)
        address += step


def from_hexdump_gen(it, step=16, sep='\b'):
    """Yield one bytearray per row reconstructed from the hexdump rows in *it*.

    ``*`` rows (with or without a trailing newline) are expanded with
    ``decompress_gen``.  ``step`` is the number of values per row and
    ``sep`` the separator that was inserted into the value column.
    """
    it = iter(it)  # next() below needs an iterator, not just an iterable
    i = j = ''
    while True:
        # A row read ahead while handling '*' is processed next.
        row = j if j else next(it, None)
        if row is None:
            break
        elif row in ('*', '*\n'):
            # If '*' is the last row, fall back to the previous row so that
            # it is decoded one more time.
            j = next(it, ''.join(i))
            yield from decompress_gen(i, j.split(), step)
        else:
            # Strip the separator only when there is one to strip: find()
            # returns 0 for an empty sep (which would eat the first address
            # character) and -1 when sep is absent (which would duplicate
            # the row).  The whole separator is removed, not just one char.
            index = row.find(sep) if sep else -1
            i = row[:index] + row[index + len(sep):] if index >= 0 else row
            i = split(r'(\s+)', i)
            j = ''
            yield to_bytes(i, step)


def test(it0, it1, offset=0, start=0, stop=None, step=16, sep='\b'):
    """Return True if the hexdump rows *it0* decode to the byte values in *it1*.

    The remaining arguments must match those used to create the dump.
    Returns False on the first row that differs, and also when *it1*
    still holds data after the dump is exhausted (a truncated dump).
    """
    it1, _, nxt = fix(it1, offset, start, stop, step)
    for i in from_hexdump_gen(it0, step=step, sep=sep):
        if mk_row(it1, nxt, start) != i:
            return False
        start = nxt
        nxt += step
    # Byte values are ints, so None can only mean "exhausted".
    return next(it1, None) is None


def read_binary_gen(file):
    """Yield the content of the binary file *file* one byte (int) at a time.

    Reads fixed-size chunks instead of iterating the file object: that
    iteration splits on ``b'\\n'``, so a file without newline bytes
    would be loaded into memory in one piece.
    """
    with open(file, 'rb') as f:
        while True:
            chunk = f.read(65536)
            if not chunk:  # empty read marks end of file
                break
            yield from chunk


def write(file, gen):
    """Write the strings from *gen* to the text file *file*, adding a newline to each."""
    with open(file, 'w') as out:
        out.writelines(text + '\n' for text in gen)


def read_gen(file):
    """Lazily yield each line of the text file *file*, line ending included."""
    with open(file, 'r') as src:
        for text in src:
            yield text


def write_binary(file, gen):
    """Write the bytes-like items from *gen* to the binary file *file*, back to back."""
    with open(file, 'wb') as out:
        out.writelines(gen)

Peilonrayz · Accepted Answer · 2017-07-05 09:50:08Z

Don't use a class when you don't need to. A generator is good enough for this, and makes the code much clearer.
Make functions/classes responsible for one thing. HexdumpGenerator changes the iterable and formats.
Use better variable names, self._mod is beyond cryptic. As in this took me a long while to understand, as it's not really modulo, and I can't think of anything else that shortens to mod. How about padding.
Rather than having to care about base_addr and start in your __iter__, you could just change the beginning of the iterable to None, and make it print empty information on None.
Rather than adding variables to your class try to keep them only in function scope. This makes them much easier to work with, and reason with. Plus the only variables you use in child classes are step, col0, delimiter, iterable and _sep. Which don't change after __iter__.

I don't think CompressHexdumpGenerator works correctly:

First, I checked what it outputted:

>>> print('\n'.join(list(CompressHexdumpGenerator(map(ord, '123456781234567812345678'), step=8))))
self._compress
00000000   31 32 33 34  35 36 37 38  12345678
*
00000010   31 32 33 34  35 36 37 38  12345678
>>> print('\n'.join(list(CompressHexdumpGenerator(map(ord, '123456781234567812345679'), step=8))))
00000000   31 32 33 34  35 36 37 38  12345678
*
00000010   31 32 33 34  35 36 37 39  12345679
>>> print('\n'.join(list(CompressHexdumpGenerator(map(ord, '123456781234567912345679'), step=8))))
00000000   31 32 33 34  35 36 37 38  12345678
00000008   31 32 33 34  35 36 37 39  12345679
*
00000010   31 32 33 34  35 36 37 39  12345679

The first and second kind of make sense. I don't really get why you'd care about the very last row but none of the ones in between. Maybe it's so you have the last index, I don't know. But the last example above makes no sense: why should it output *? It's just a waste of a line and makes me go 'huh, nothing's there'.

If I'm reading things correctly, you don't need _compress if you move self.row = i out of the if else statement, since all you change is the index to that of the last.

And so ignoring the possible CompressHexdumpGenerator bug, I'd change your code to heavily use itertools, rather than use some odd mutation stuff:

import itertools

def byte_gen(iterable, chunk=1024):
    """Yield the byte values of *iterable*, pulling *chunk* items at a time.

    Each batch goes through a ``bytearray``, so every item must be an
    int in ``range(256)``.
    """
    # Take the iterator once: islice() on a re-iterable object (list,
    # bytes) would restart from the beginning on every pass and never end.
    iterator = iter(iterable)
    while True:
        # Only ``itertools`` itself is imported here, so qualify islice.
        ret = bytearray(itertools.islice(iterator, chunk))
        if not ret:
            break
        yield from ret


def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length tuples, padding the last with *fillvalue*.

    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    # n references to one shared iterator: each tuple takes n items in turn.
    shared = iter(iterable)
    return itertools.zip_longest(*(shared,) * n, fillvalue=fillvalue)


def hex_iter_changer(iterable, base_addr=0, start=0, stop=None, chunk=1024):
    """Return the selected slice of *iterable* prefixed with ``None`` placeholders.

    ``start``/``stop`` select the items to keep; ``base_addr + start``
    ``None`` values are put in front of them.  ``chunk`` is the batch
    size handed to ``byte_gen``.
    """
    sliced = itertools.islice(iterable, start, stop)
    # Forward ``chunk``; it was accepted but never passed on.
    byte_slice = byte_gen(sliced, chunk)
    empty_prefix = itertools.repeat(None, base_addr + start)
    return itertools.chain(empty_prefix, byte_slice)


def _hexdump_generator(iterable, base_addr=0, start=0, stop=None, width=16, sep='\b'):
    """Yield ``(address, hex cells, text cells)`` for every row of the dump.

    Both cell lists have ``width`` entries; an entry is ``None`` where
    the row has no byte.  Non-printable characters are reported as
    ``'.'`` so that a row can never contain control characters such as
    a newline.  ``sep`` is accepted for signature symmetry and not used.

    NOTE: ``hex_iter_changer`` prepends ``base_addr + start``
    placeholders and addresses count up from 0, so a large base address
    yields that many empty cells before the first data byte.
    """
    iterable = hex_iter_changer(iterable, base_addr, start, stop)
    groups = grouper(iterable, width)
    for index, group in zip(itertools.count(0, width), groups):
        hex_values = []
        values = []
        for byte in group:
            if byte is None:
                hex_values.append(None)
                values.append(None)
                continue
            char = chr(byte)
            hex_values.append(format(byte, '02X'))
            values.append(char if char.isprintable() else '.')
        yield index, hex_values, values


def hexdump_generator(iterable, base_addr=0, start=0, stop=None, width=16, sep='\b'):
    """Yield formatted hexdump rows built from ``_hexdump_generator``.

    Missing cells are rendered as blanks and ``sep`` is inserted at the
    middle of the value column.
    """
    half = width // 2
    rows = _hexdump_generator(iterable, base_addr, start, stop, width, sep)
    for address, cells, chars in rows:
        hex_column = [cell if cell is not None else '  ' for cell in cells]
        hex_column.insert(half, sep)
        text = ''.join(' ' if ch is None else ch for ch in chars)
        yield '{:08X}   {}  {}'.format(address, ' '.join(hex_column), text)


def compress_hexdump_generator(iterable, base_addr=0, start=0, stop=None, width=16, sep='\b'):
    """Yield hexdump rows with runs of identical rows collapsed to ``*``.

    A row whose fields (ignoring the address) equal those of the row
    before it is suppressed; the first suppressed row of a run is
    replaced by ``*``.  If the dump ends inside a run, the final row is
    emitted so that the last address is visible.
    """
    previous = ''
    repeats = 0
    for line in hexdump_generator(iterable, base_addr, start, stop, width, sep):
        same = previous.split()[1:] == line.split()[1:]
        previous = line
        if not same:
            yield line
            repeats = 0
            continue
        if repeats == 0:
            yield '*'
        repeats += 1
    if repeats:
        yield previous

You are right, and your end result looks pretty good. I should've looked into _compress more recently. I wanted the last row for undoing. Without it, I could not restore a binary file. Thanks for pointing it out. — user133955
– user133955, Commented Jul 5, 2017 at 10:27

Stack Exchange Network

Python 3.x Hexdump

1 Answer 1

You must log in to answer this question.

Hot Network Questions

Python 3.x Hexdump

1 Answer 1

You must log in to answer this question.

Related

Hot Network Questions