Most Python "duplicate file finder" scripts I found brute-force the problem by hashing every file under a directory. So I wrote my own, hopefully faster, script that does things more intelligently.
It works in three stages: first it groups files of exactly the same size, then it compares a hash of only N bytes at the head and tail of each remaining candidate, and finally it compares the candidates' full-file hashes.
My #1 concern is correctness, followed closely by maintainability.
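The core idea is that each stage re-partitions the surviving candidate groups using a progressively more expensive key, keeping only the groups that still hold two or more files. As a minimal sketch of that refinement step (the key callables here are hypothetical stand-ins for the size, head/tail-hash, and full-hash computations in the actual script below):

from collections import defaultdict

def refine(groups, key):
    # Re-partition every candidate group by key(filename), keeping only
    # the resulting groups that still contain at least two files
    refined = defaultdict(list)
    for group in groups:
        for fname in group:
            refined[key(fname)].append(fname)
    return [g for g in refined.values() if len(g) >= 2]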
__author__ = 'pepoluan'
import os
import hashlib
# Global configuration, grouped in a class so the functions can read the
# settings without needing 'global' statements
class G:
    OutFormat = 'list'            # Output format: 'list' or 'csv'
    OutFile = None                # File object to write results to; None = stdout
    StartPaths = [                # Paths to scan; files seen under an earlier path are skipped later
        'D:\\DATA_2',
        'D:\\DATA_1',
        'D:\\'
    ]
    PartialCheckSize = 8192       # Bytes to hash at the head and at the tail of each file
    FullFileHash = True           # Confirm dupes with a full-file hash as the last stage
    MinSize = 16 * 1024 * 1024    # Ignore files smaller than this (16 MiB)
    ProgPeriod = 1000             # Print a progress dot every this many new files
    FullBlockSize = 1024 * 1024   # Block size (1 MiB) for reading during full-file hashing
    Quiet = False                 # If True, suppress all progress output
    HashFunc = hashlib.md5        # Hash function used by all stages
def get_walker_generator(at_path):
    # os.walk already yields the file names in every directory, so there is
    # no need for glob here; globbing for '*.*' would also miss files without
    # an extension and would need glob's special characters escaped in dirnames
    return (
        os.path.join(dirpath, fname)
        for dirpath, _dirnames, filenames in os.walk(at_path)
        for fname in filenames
    )
def dict_filter_by_len(rawdict, minlen=2):
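    """Return only the entries of rawdict whose value has at least minlen members."""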
    assert isinstance(rawdict, dict)
    return {k: v for k, v in rawdict.items() if len(v) >= minlen}
def qprint(*args, **kwargs):
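    """print() wrapper that stays silent when G.Quiet is set."""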
    if not G.Quiet:
        print(*args, **kwargs)
def get_dupes_by_size(path_list):
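    """Stage 1: recursively stat every file under the given paths and group
    same-sized files; returns {(None, size): [filenames]} for every size of at
    least G.MinSize that occurs more than once."""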
    qprint('===== Recursively stat-ing {0} ====='.format(path_list))
    processed = set()
    size_dict = {}
    for statpath in path_list:
        c = 0
        uniq_in_path = 0
        qprint('{0}...'.format(statpath), end='')
        for fname in get_walker_generator(statpath):
            try:
                if c >= G.ProgPeriod:
                    qprint('.', end='', flush=True)
                    c = 0
                if fname not in processed:
                    c += 1
                    uniq_in_path += 1
                    fstat = os.stat(fname)
                    fsize = fstat.st_size
                    size_dict.setdefault(fsize, set()).add(fname)
                    processed.add(fname)
            except Exception:
                print('\nException on ', fname)
                raise
        qprint(uniq_in_path)
    qprint('\nTotal files: ', len(processed))
    # Selectors are tuples with the size at the rear; later stages keep that shape
    dupe_sizes = {
        (None, sz): list(fset)
        for sz, fset in size_dict.items()
        if sz >= G.MinSize and len(fset) > 1
    }
    qprint('Dupes: ', len(dupe_sizes))
    return dupe_sizes
def refine_dupes_by_partial_hash(dupes_dict, partial_check_size=G.PartialCheckSize, hashfunc=G.HashFunc):
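    """Stage 2: re-group candidate dupes by a hash of their first and last
    partial_check_size bytes; returns {(head_hash, tail_hash, size): [filenames]}
    for groups that still have 2+ members."""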
    assert isinstance(dupes_dict, dict)
    qprint('===== Checking hash of first and last {0} bytes ====='.format(partial_check_size))
    qprint('Processing...', end='', flush=True)
    size_and_hashes = {}
    for selector, flist in dupes_dict.items():
        fsize = selector[-1]
        for fname in flist:
            with open(fname, 'rb') as fin:
                hash_front = hashfunc(fin.read(partial_check_size)).hexdigest()
                # Hash the last partial_check_size bytes; for files no larger
                # than the head block, the head hash already covers everything
                seek_targ = fsize - partial_check_size
                if seek_targ > 0:
                    fin.seek(seek_targ)
                    hash_rear = hashfunc(fin.read(partial_check_size)).hexdigest()
                else:
                    hash_rear = hash_front
            # "size" at rear, so a simple print will still result in a nicely-aligned table
            selector = (hash_front, hash_rear, fsize)
            flist = size_and_hashes.get(selector, [])
            flist.append(fname)
            size_and_hashes[selector] = flist
            qprint('.', end='', flush=True)
    dupe_exact = dict_filter_by_len(size_and_hashes)
    qprint('\nDupes: ', len(dupe_exact))
    return dupe_exact
def refine_dupes_by_full_hash(dupes_dict, block_size=G.FullBlockSize, hashfunc=G.HashFunc):
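    """Stage 3: re-group candidate dupes by their full-file hash; returns
    {(full_hash, size): [filenames]} for groups that still have 2+ members."""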
    assert isinstance(dupes_dict, dict)
    qprint('===== Checking full hashes of dupes =====')
    qprint('Processing...', end='', flush=True)
    fullhashes = {}
    for selector, flist in dupes_dict.items():
        sz = selector[-1]  # Keep the size around so it can still be reported to the user
        for fname in flist:
            hasher = hashfunc()
            with open(fname, 'rb') as fin:
                # Hash in blocks so huge files are not slurped into memory at once
                for buf in iter(lambda: fin.read(block_size), b''):
                    hasher.update(buf)
            # "size" at rear, so a simple print will still result in a nicely-aligned table
            slct = (hasher.hexdigest(), sz)
            flist = fullhashes.get(slct, [])
            flist.append(fname)
            fullhashes[slct] = flist
            qprint('.', end='', flush=True)
    dupe_exact = dict_filter_by_len(fullhashes)
    qprint('\nDupes: ', len(dupe_exact))
    return dupe_exact
def output_results(dupes_dict, out_format=G.OutFormat, out_file=G.OutFile):
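    """Write dupes_dict in 'list' or 'csv' format to out_file (stdout if None)."""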
    assert isinstance(dupes_dict, dict)
    kiys = sorted(dupes_dict, key=lambda k: k[-1])  # Sort selectors by size (their last element)
    if out_file is not None:
        qprint('Writing result in "{0}" format to file: {1} ...'.format(out_format, out_file), end='')
    else:
        qprint()
    if out_format == 'list':
        for kiy in kiys:
            flist = dupes_dict[kiy]
            print('-- {0}:'.format(kiy), file=out_file)
            flist.sort()
            for fname in flist:
                print('   {0}'.format(fname), file=out_file)
    elif out_format == 'csv':
        print('"Ord","Selector","FullPath"', file=out_file)
        order = 1
        for kiy in kiys:
            flist = dupes_dict[kiy]
            flist.sort()
            for fname in flist:
                print('"{0}","{1}","{2}"'.format(order, kiy, fname), file=out_file)
                order += 1
    if out_file is not None:
        qprint('done.')
def _main():
    dupes = get_dupes_by_size(G.StartPaths)
    dupes = refine_dupes_by_partial_hash(dupes)
    if G.FullFileHash:
        dupes = refine_dupes_by_full_hash(dupes)
    output_results(dupes, out_format=G.OutFormat, out_file=G.OutFile)
if __name__ == '__main__':
    _main()
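In case it helps with testing: the script is driven entirely by the G class, so besides editing G in place you can import the module and override its attributes before calling _main(). A hypothetical example follows (it assumes the script is saved as dupefinder.py; note that PartialCheckSize, FullBlockSize, and HashFunc are captured as default arguments when the functions are defined, so overriding them on G after import has no effect on the defaults, and they should be passed explicitly instead):

import dupefinder  # hypothetical module name for the script above

dupefinder.G.StartPaths = ['/home/me/photos', '/mnt/backup']  # hypothetical paths
dupefinder.G.MinSize = 1024 * 1024   # only consider files of 1 MiB and up
dupefinder.G.OutFormat = 'csv'
with open('dupes.csv', 'w') as fout:
    dupefinder.G.OutFile = fout      # write the CSV to a file instead of stdout
    dupefinder._main()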

