Not long ago I posted a small hexdump generator function from a program I've been writing, and I have since applied what a reviewer suggested. The goal was to lazily hexdump bytes objects (byte strings, binary files) without committing to I/O code.
Here is the relevant code (minus docstrings/tests/script), with some description below:
import re
from itertools import islice
class HexdumpGenerator:
    """Lazily turn an iterable of byte values into formatted hexdump rows.

    Each yielded row is ``fmt.format(address, cells, text)`` where ``cells``
    are the per-byte values formatted with ``col1`` (joined by single spaces,
    with the separator inserted at index ``step // 2``) and ``text`` shows
    each byte as its character, or ``'.'`` when it is not printable.

    Iterating consumes the underlying iterable and advances ``start``,
    ``_next`` and ``_mod`` on the instance.
    """

    def __init__(self, iterable, base_addr=0, start=0, stop=None, step=16, sep='\b'):
        # Only the [start:stop) window of the input is ever dumped.
        self.iterable = islice(iterable, start, stop)
        self.base_addr = base_addr
        self.start = start
        self.stop = stop
        self.step = step
        # Format specs for the address column and for each byte cell.
        self.col0 = '08X'
        self.col1 = '02X'
        self.fmt = '{} {} {}'
        # One-element list so it can be multiplied into a run of padding cells.
        self.placeholder = [' ']
        self._sep = sep[0]
        # Number of padding cells in front of the first row, and the input
        # position at which the first row ends.
        self._mod = (base_addr + start) % self.step
        self._next = start + self.step - self._mod

    def __iter__(self):
        """Yield one formatted row per chunk until the input is exhausted."""
        while True:
            chunk = bytearray(islice(self.iterable, self._next - self.start))
            if len(chunk) == 0:
                return
            lead = self._mod
            address = format(self.base_addr + self.start - lead, self.col0)
            cells = lead * self.placeholder
            cells += [format(value, self.col1) for value in chunk]
            text = lead * ' ' + ''.join(
                ch if ch.isprintable() else '.' for ch in map(chr, chunk)
            )
            # Whatever is missing to fill the row becomes trailing padding.
            self._mod = self.step - len(cells)
            cells += self._mod * self.placeholder
            text += self._mod * ' '
            cells.insert(self.step // 2, self._sep)
            yield self.fmt.format(address, ' '.join(cells), text)
            self.start = self._next
            self._next += self.step
class CompressHexdumpGenerator(HexdumpGenerator):
    """Hexdump rows with runs of repeated rows collapsed into a single ``'*'``.

    Two rows count as repeats when everything after their first
    whitespace-separated field (the address) is equal.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.duplicates = 0     # rows suppressed since the last emitted row
        self.delimiter = ' '    # text that ends the address column
        self.row = ''           # last row that was actually emitted

    def _compress(self):
        """Return the last emitted row with its address moved past the run."""
        cut = self.row.index(self.delimiter)
        shifted = int(self.row[:cut], 16) + self.duplicates * self.step
        return format(shifted, self.col0) + self.row[cut:]

    def __iter__(self):
        """Yield rows, replacing each run of repeats by one ``'*'`` marker."""
        for line in super().__iter__():
            repeated = self.row.split()[1:] == line.split()[1:]
            if not repeated:
                yield line
                self.row = line
                self.duplicates = 0
                continue
            if self.duplicates == 0:
                yield '*'
            self.duplicates += 1
        # The dump ended inside a run: close it with a readdressed row.
        if self.duplicates:
            yield self._compress()
class FromHexdumpGenerator(CompressHexdumpGenerator):
    """Rebuild byte rows from the text rows of a (possibly compressed) hexdump.

    The wrapped iterable must yield dump rows as strings; iterating yields one
    ``bytearray`` per reconstructed row.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.base = 16      # numeric base of the byte cells
        self.len = '3'      # minimum width of the whitespace run ending the cells

    def get_repr(self, _row):
        """Return the bytes encoded in a row already split on whitespace.

        ``_row`` alternates fields and whitespace runs; entries 0 and 1 (the
        address and the run after it) are skipped. Parsing stops at the first
        whitespace run at least ``self.len`` characters wide.
        """
        row = bytearray()
        for i in _row[2:self.step * 2 + 1]:
            if i.isalnum():
                row.append(int(i, self.base))
            # Raw string: the pattern uses a regex whitespace class, which is
            # an invalid escape sequence in a plain string literal.
            elif re.match(r'(\s{' + self.len + ',})', i):
                break
        return row

    def decompress_gen(self, row0, row1):
        """Yield the rows hidden by a ``'*'`` between ``row0`` and ``row1``.

        ``row0`` is the split row before the marker and ``row1`` the split row
        after it; one copy of ``row0``'s bytes is produced for every ``step``
        strictly between their two addresses.
        """
        i = int(row0[0].rstrip(self.delimiter), 16) + self.step
        j = int(row1[0].rstrip(self.delimiter), 16)
        while i < j:
            row = format(i, self.col0) + self.delimiter
            row = [row.rstrip(' ')] + row0[1:]
            yield self.get_repr(row)
            i += self.step

    def __iter__(self):
        """Yield a ``bytearray`` for every row, expanding ``'*'`` markers."""
        i = j = ''
        while True:
            # A row read ahead while handling '*' is processed before new input.
            row = j if j else next(self.iterable, None)
            if row is None:
                break
            elif row in ('*', '*\n'):
                # A bare next() here would raise StopIteration inside the
                # generator, which surfaces as RuntimeError (PEP 479).
                j = next(self.iterable, None)
                if j is None:
                    # Dump ends on '*': the run length is unknown, so stop.
                    break
                yield from self.decompress_gen(i, j.split())
            else:
                index = row.find(self._sep)
                # find() returns -1 when the separator is missing; slicing
                # with -1 would glue a copy of the row onto itself.
                if index >= 0:
                    row = row[:index] + row[index + 1:]
                i = re.split(r'(\s+)', row)
                j = ''
                yield self.get_repr(i)
Utility functions:
from itertools import chain
def read_binary_gen(file):
    """Yield the bytes of ``file`` one integer at a time.

    The file is read in fixed-size chunks. Iterating a binary file object
    splits on newline bytes instead, so a file with few or no newlines would
    be pulled into memory in one piece.
    """
    with open(file, 'rb') as f:
        while True:
            chunk = f.read(65536)
            if not chunk:
                break
            yield from chunk
def write(file, gen):
    """Write every string from ``gen`` to ``file``, each followed by a newline."""
    with open(file, 'w') as out:
        out.writelines(line + '\n' for line in gen)
def read_gen(file):
    """Yield the lines of the text file ``file`` one by one."""
    with open(file, 'r') as src:
        for line in src:
            yield line
def write_binary(file, gen):
    """Write every bytes-like chunk from ``gen`` to ``file`` in binary mode."""
    with open(file, 'wb') as out:
        out.writelines(gen)
read_binary_gen() is meant to be passed to the first two generator classes, while read_gen() is meant to be passed to the last one, so that a file is never read into memory all at once.
I've tested the code with different formats (03o, 03d): if specifying 03d, then the placeholder attribute must be assigned a list with a single string composed of 3 spaces. Using FromHexdumpGenerator to undo the hexdump would then require that base be assigned the integer 10, and len the number '4' (3 + 1). The col0 attribute must remain hex (as it is an address).
If fmt's first column ends with a colon (as I've seen other programs use), the delimiter must be set to that value.
I struggled with different encodings before figuring out this was a case for bytearray, so if a bytes object can be dumped, it can be undumped.
EDIT
@Peilonrayz I attempted to fix the issues you pointed out and came up with the following (this is the whole module, with some additional functionality I'd been working on):
from itertools import islice, takewhile, tee, chain
from re import match, split
from copy import copy
from colorama import init, Style
# Module-wide formatting state. change_format() rebinds COL0, COL1, PAD, FMT,
# DLM and BASE, so the dump and undump helpers below read them at call time.

# Rebinds the imported name to itself; presumably this keeps the otherwise
# unused import referenced -- TODO confirm the intent.
tee = tee
# Initialise colorama (its Style codes are used by the highlight helpers).
init()
# Format spec applied to the address column.
COL0 = '08X'
# Format spec applied to every byte cell.
COL1 = '02X'
# Cell emitted in place of a missing byte.
# NOTE(review): change_format() sets PAD to as many spaces as COL1's width;
# confirm this default matches the two-character '02X' cells.
PAD = ' '
# Row template: address column, byte cells, text column.
FMT = '{} {} {}'
# Character that follows the address field in FMT.
DLM = ' '
# Numeric base used to parse byte cells back into integers.
BASE = 16
def change_format(col0, col1, fmt=''):
    """Switch the module-wide dump format.

    Parameters:
        col0: format spec for the address column.
        col1: format spec for a byte cell; everything before the last
            character is read as the cell width and the last character must
            be one of ``b``, ``o``, ``d``, ``x`` or ``X``.
        fmt: optional row template; when given, the character right after its
            first ``}`` becomes the address delimiter.

    Raises:
        ValueError: the width part of ``col1`` is not an integer, or ``fmt``
            contains no ``}``.
        IndexError: ``fmt`` ends immediately after its first ``}``.
        KeyError: the type character of ``col1`` is not supported.

    Every derived value is computed before any global is rebound, so a
    rejected argument leaves the previous format fully in place.
    """
    global COL0, COL1, PAD, FMT, DLM, BASE
    pad = ' ' * int(col1[:-1])
    dlm = fmt[fmt.index('}') + 1] if fmt else None
    base = {'b': 2, 'o': 8, 'd': 10, 'x': 16, 'X': 16}[col1[-1]]
    COL0 = col0
    COL1 = col1
    PAD = pad
    if fmt:
        FMT = fmt
        DLM = dlm
    BASE = base
def fix(it, offset, start, stop, step):
    """Prepare an input for row-wise dumping.

    Returns a 3-tuple: the ``[start:stop)`` window of ``it``, the number of
    cells by which ``offset + start`` overshoots a multiple of ``step``, and
    the input position at which the first row ends.
    """
    misalignment = (offset + start) % step
    window = islice(it, start, stop)
    first_row_end = start + step - misalignment
    return window, misalignment, first_row_end
def mk_row(it, nxt, start):
    """Take up to ``nxt - start`` values from ``it`` and return them as a bytearray."""
    width = nxt - start
    taken = islice(it, width)
    return bytearray(taken)
def to_hex(i):
    """Format the byte value ``i`` with the module's current COL1 spec."""
    return '{:{}}'.format(i, COL1)
def to_chr(i):
    """Return the character for code point ``i``, or ``'.'`` if it is not printable."""
    symbol = chr(i)
    if symbol.isprintable():
        return symbol
    return '.'
def pad_gen(fn, pad, it, n, step):
    """Yield ``n`` pads, then ``fn(x)`` for each ``x`` in ``it``, then trailing pads.

    Trailing pads are added until the converted items number ``step - n``;
    none are added when ``it`` already supplied that many or more.
    """
    yield from [pad] * n
    converted = 0
    for converted, item in enumerate(it, 1):
        yield fn(item)
    # A non-positive multiplier produces an empty list, i.e. no padding.
    yield from [pad] * (step - n - converted)
def hexdump_gen(it, offset=0, start=0, stop=None, step=16, sep='\b'):
    """Lazily yield formatted hexdump rows for the ``[start:stop)`` window of ``it``.

    Each row is ``FMT.format(address, cells, text)``: the address formatted
    with COL0, the byte cells joined by single spaces with ``sep`` inserted
    at index ``step // 2``, and the text rendering of the same bytes. Only
    the first row can carry leading padding; later rows start aligned.
    """
    data, lead, row_end = fix(it, offset, start, stop, step)
    row_start = start
    chunk = mk_row(data, row_end, row_start)
    while chunk:
        address = format(offset + row_start - lead, COL0)
        cells = [*pad_gen(to_hex, PAD, chunk, lead, step)]
        text = ''.join(pad_gen(to_chr, ' ', chunk, lead, step))
        cells.insert(step // 2, sep)
        yield FMT.format(address, ' '.join(cells), text)
        row_start, row_end, lead = row_end, row_end + step, 0
        chunk = mk_row(data, row_end, row_start)
def compress_hexdump_gen(it, offset=0, start=0, stop=None, step=16, sep='\b'):
    """Yield hexdump rows with runs of repeated rows collapsed into ``'*'``.

    Takes the same arguments as ``hexdump_gen`` and forwards them unchanged.
    Two rows count as repeats when everything after their first
    whitespace-separated field (the address) is equal.

    When the dump ends inside a run of more than one suppressed row, the last
    emitted row is yielded again with its address advanced by
    ``duplicates * step``.

    The parameters are spelled out rather than taken as ``*args, **kwargs``
    so that ``step`` is honoured when passed positionally; looking it up in
    the keyword arguments alone silently fell back to 16.
    """
    row = ''
    duplicates = 0
    for i in hexdump_gen(it, offset, start, stop, step, sep):
        if row.split()[1:] == i.split()[1:]:
            if not duplicates:
                yield '*'
            duplicates += 1
        else:
            yield i
            row = i
            duplicates = 0
    if duplicates > 1:
        index = row.index(DLM)
        col0 = int(row[:index], 16) + duplicates * step
        yield format(col0, COL0) + row[index:]
def predicate(i):
    """Return False when ``i`` starts with three or more whitespace characters.

    Used with ``takewhile`` to stop reading cells at the first wide gap.
    """
    # Raw string: the whitespace class is an invalid escape sequence in a
    # plain string literal and triggers a warning on current Python versions.
    return not match(r'(\s{3,})', i)
def highlight(row, sep, ba):
    """Wrap selected byte cells of ``row`` in bright-style codes, in place.

    Cells are examined up to the first wide whitespace gap. A cell is
    highlighted when its value is in ``ba``; with an empty ``ba``, when its
    character is printable. Whitespace entries and ``sep`` are skipped.
    """
    snapshot = copy(row)
    for position, cell in enumerate(takewhile(predicate, snapshot)):
        if ' ' in cell or cell == sep:
            continue
        value = int(cell, BASE)
        selected = value in ba if ba else chr(value).isprintable()
        if selected:
            row[position] = Style.BRIGHT + cell + Style.RESET_ALL
def highlight_hexdump_gen(it, step=16, sep='\b', ba=b''):
    """Yield the rows of ``it`` with selected byte cells highlighted.

    Parameters:
        it: iterable of hexdump rows (strings).
        step: number of byte cells per row.
        sep: separator cell used when the rows were produced.
        ba: byte values to highlight; when empty, ``highlight`` falls back to
            marking cells whose character is printable.
    """
    ba = bytearray(ba)
    # End of the cell region in the split row; a non-empty separator adds two
    # more entries (the separator and the whitespace run after it).
    index = step * 2 + (1 if not sep else 3)
    for i in it:
        # Raw string: the whitespace class is an invalid escape sequence in a
        # plain literal. The capturing group keeps the whitespace runs so the
        # row can be rejoined unchanged.
        row0 = split(r'(\s+)', i)
        row1 = row0[2:index]
        highlight(row1, sep, ba)
        yield ''.join(row0[:2] + row1 + row0[index:])
def to_bytes(row, step):
    """Return the bytes encoded by the cells of a split hexdump row.

    Entries ``row[2:step * 2 + 1]`` are read up to the first wide whitespace
    gap; highlight codes are stripped and every alphanumeric cell is parsed
    in the module's current BASE.
    """
    cell_span = row[2:step * 2 + 1]
    plain_cells = (
        cell.replace(Style.BRIGHT, '').replace(Style.RESET_ALL, '')
        for cell in takewhile(predicate, cell_span)
    )
    return bytearray(int(cell, BASE) for cell in plain_cells if cell.isalnum())
def decompress_gen(row0, row1, step):
    """Yield the byte rows hidden by a ``'*'`` between ``row0`` and ``row1``.

    ``row0`` is the split row before the marker and ``row1`` the split row
    after it. One copy of ``row0``'s bytes is produced for every ``step``
    strictly between their two addresses.
    """
    address = int(row0[0].rstrip(DLM), 16) + step
    limit = int(row1[0].rstrip(DLM), 16)
    while address < limit:
        label = format(address, COL0) + DLM
        yield to_bytes([label.rstrip(' ')] + row0[1:], step)
        address += step
def from_hexdump_gen(it, step=16, sep='\b'):
    """Yield a ``bytearray`` for every row of a (possibly compressed) hexdump.

    Parameters:
        it: iterable of dump rows (strings), with or without a trailing
            newline.
        step: number of byte cells per row.
        sep: separator cell used when the rows were produced.

    A ``'*'`` row is expanded by repeating the previous row for every address
    between it and the row that follows. If the dump ends on ``'*'``, the
    previous row is used as the follow-up, so it is emitted once more.
    """
    # Accept any iterable, not only iterators: next() below needs an iterator.
    it = iter(it)
    i = j = ''
    while True:
        # A row read ahead while handling '*' is processed before new input.
        row = j if j else next(it, None)
        if row is None:
            break
        elif row in ('*', '*\n'):
            j = next(it, ''.join(i))
            yield from decompress_gen(i, j.split(), step)
        else:
            # Drop the separator only when it is really there. find() gives
            # -1 for a missing separator and 0 for an empty one; slicing with
            # either would mangle the row.
            index = row.find(sep) if sep else -1
            if index >= 0:
                row = row[:index] + row[index + len(sep):]
            # Raw string: the whitespace class is an invalid escape sequence
            # in a plain string literal.
            i = split(r'(\s+)', row)
            j = ''
            yield to_bytes(i, step)
def test(it0, it1, offset=0, start=0, stop=None, step=16, sep='\b'):
    """Check that the hexdump ``it0`` decodes to exactly the bytes of ``it1``.

    Parameters:
        it0: iterable of dump rows, as accepted by ``from_hexdump_gen``.
        it1: iterable of the reference byte values.
        offset, start, stop, step, sep: the settings the dump was made with.

    Returns:
        True when every decoded row equals the matching slice of ``it1`` and
        no reference data is left over; False otherwise.
    """
    it1, _, nxt = fix(it1, offset, start, stop, step)
    for i in from_hexdump_gen(it0, step=step, sep=sep):
        if mk_row(it1, nxt, start) != i:
            return False
        start = nxt
        nxt += step
    # A dump that stops early must not pass: the reference has to be used up.
    return next(it1, None) is None
def read_binary_gen(file):
    """Yield the bytes of ``file`` one integer at a time.

    The file is read in fixed-size chunks. Iterating a binary file object
    splits on newline bytes instead, so a file with few or no newlines would
    be pulled into memory in one piece.
    """
    with open(file, 'rb') as f:
        while True:
            chunk = f.read(65536)
            if not chunk:
                break
            yield from chunk
def write(file, gen):
    """Write every string from ``gen`` to ``file``, each followed by a newline."""
    with open(file, 'w') as out:
        out.writelines(line + '\n' for line in gen)
def read_gen(file):
    """Yield the lines of the text file ``file`` one by one."""
    with open(file, 'r') as src:
        for line in src:
            yield line
def write_binary(file, gen):
    """Write every bytes-like chunk from ``gen`` to ``file`` in binary mode."""
    with open(file, 'wb') as out:
        out.writelines(gen)