Most Python "duplicate file finder" scripts I found brute-force the problem by hashing every file under a directory. So I wrote my own, hopefully faster, script that does things more intelligently.
It works in three stages: first it groups files of exactly the same size, then it compares a hash of only N bytes at the head and tail of each remaining candidate, and finally it compares the candidates' full-file hashes.
My #1 concern is correctness, followed closely by maintainability.
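The core idea is that each stage re-partitions the surviving candidate groups using a progressively more expensive key, keeping only the groups that still hold two or more files. As a minimal sketch of that refinement step (the key callables here are hypothetical stand-ins for the size, head/tail-hash, and full-hash computations in the actual script below):

from collections import defaultdict

def refine(groups, key):
    # Re-partition every candidate group by key(filename), keeping only
    # the resulting groups that still contain at least two files
    refined = defaultdict(list)
    for group in groups:
        for fname in group:
            refined[key(fname)].append(fname)
    return [g for g in refined.values() if len(g) >= 2]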
__author__ = 'pepoluan'
import os
import hashlib
# Global configuration, grouped in a class so the functions can read the
# settings without needing 'global' statements
class G:
    OutFormat = 'list'            # Output format: 'list' or 'csv'
    OutFile = None                # File object to write results to; None = stdout
    StartPaths = [                # Paths to scan; files seen under an earlier path are skipped later
        'D:\\DATA_2',
        'D:\\DATA_1',
        'D:\\'
    ]
    PartialCheckSize = 8192       # Bytes to hash at the head and at the tail of each file
    FullFileHash = True           # Confirm dupes with a full-file hash as the last stage
    MinSize = 16 * 1024 * 1024    # Ignore files smaller than this (16 MiB)
    ProgPeriod = 1000             # Print a progress dot every this many new files
    FullBlockSize = 1024 * 1024   # Block size (1 MiB) for reading during full-file hashing
    Quiet = False                 # If True, suppress all progress output
    HashFunc = hashlib.md5        # Hash function used by all stages
def get_walker_generator(at_path):
    # os.walk already yields the file names in every directory, so there is
    # no need for glob here; globbing for '*.*' would also miss files without
    # an extension and would need glob's special characters escaped in dirnames
    return (
        os.path.join(dirpath, fname)
        for dirpath, _dirnames, filenames in os.walk(at_path)
        for fname in filenames
    )
def dict_filter_by_len(rawdict, minlen=2):
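    """Return only the entries of rawdict whose value has at least minlen members."""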
    assert isinstance(rawdict, dict)
    return {k: v for k, v in rawdict.items() if len(v) >= minlen}
def qprint(*args, **kwargs):
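    """print() wrapper that stays silent when G.Quiet is set."""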
    if not G.Quiet:
        print(*args, **kwargs)
def get_dupes_by_size(path_list):
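    """Stage 1: recursively stat every file under the given paths and group
    same-sized files; returns {(None, size): [filenames]} for every size of at
    least G.MinSize that occurs more than once."""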
    qprint('===== Recursively stat-ing {0} ====='.format(path_list))
    processed = set()
    size_dict = {}
    for statpath in path_list:
        c = 0
        uniq_in_path = 0
        qprint('{0}...'.format(statpath), end='')
        for fname in get_walker_generator(statpath):
            try:
                if c >= G.ProgPeriod:
                    qprint('.', end='', flush=True)
                    c = 0
                if fname not in processed:
                    c += 1
                    uniq_in_path += 1
                    fstat = os.stat(fname)
                    fsize = fstat.st_size
                    size_dict.setdefault(fsize, set()).add(fname)
                    processed.add(fname)
            except Exception:
                print('\nException on ', fname)
                raise
        qprint(uniq_in_path)
    qprint('\nTotal files: ', len(processed))
    # Selectors are tuples with the size at the rear; later stages keep that shape
    dupe_sizes = {
        (None, sz): list(fset)
        for sz, fset in size_dict.items()
        if sz >= G.MinSize and len(fset) > 1
    }
    qprint('Dupes: ', len(dupe_sizes))
    return dupe_sizes
def refine_dupes_by_partial_hash(dupes_dict, partial_check_size=G.PartialCheckSize, hashfunc=G.HashFunc):
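    """Stage 2: re-group candidate dupes by a hash of their first and last
    partial_check_size bytes; returns {(head_hash, tail_hash, size): [filenames]}
    for groups that still have 2+ members."""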
    assert isinstance(dupes_dict, dict)
    qprint('===== Checking hash of first and last {0} bytes ====='.format(partial_check_size))
    qprint('Processing...', end='', flush=True)
    size_and_hashes = {}
    for selector, flist in dupes_dict.items():
        fsize = selector[-1]
        for fname in flist:
            with open(fname, 'rb') as fin:
                hash_front = hashfunc(fin.read(partial_check_size)).hexdigest()
                # Hash the last partial_check_size bytes; for files no larger
                # than the head block, the head hash already covers everything
                seek_targ = fsize - partial_check_size
                if seek_targ > 0:
                    fin.seek(seek_targ)
                    hash_rear = hashfunc(fin.read(partial_check_size)).hexdigest()
                else:
                    hash_rear = hash_front
            # "size" at rear, so a simple print will still result in a nicely-aligned table
            selector = (hash_front, hash_rear, fsize)
            flist = size_and_hashes.get(selector, [])
            flist.append(fname)
            size_and_hashes[selector] = flist
            qprint('.', end='', flush=True)
    dupe_exact = dict_filter_by_len(size_and_hashes)
    qprint('\nDupes: ', len(dupe_exact))
    return dupe_exact
def refine_dupes_by_full_hash(dupes_dict, block_size=G.FullBlockSize, hashfunc=G.HashFunc):
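    """Stage 3: re-group candidate dupes by their full-file hash; returns
    {(full_hash, size): [filenames]} for groups that still have 2+ members."""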
    assert isinstance(dupes_dict, dict)
    qprint('===== Checking full hashes of dupes =====')
    qprint('Processing...', end='', flush=True)
    fullhashes = {}
    for selector, flist in dupes_dict.items():
        sz = selector[-1]  # Keep the size around so it can still be reported to the user
        for fname in flist:
            hasher = hashfunc()
            with open(fname, 'rb') as fin:
                # Hash in blocks so huge files are not slurped into memory at once
                for buf in iter(lambda: fin.read(block_size), b''):
                    hasher.update(buf)
            # "size" at rear, so a simple print will still result in a nicely-aligned table
            slct = (hasher.hexdigest(), sz)
            flist = fullhashes.get(slct, [])
            flist.append(fname)
            fullhashes[slct] = flist
            qprint('.', end='', flush=True)
    dupe_exact = dict_filter_by_len(fullhashes)
    qprint('\nDupes: ', len(dupe_exact))
    return dupe_exact
def output_results(dupes_dict, out_format=G.OutFormat, out_file=G.OutFile):
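    """Write dupes_dict in 'list' or 'csv' format to out_file (stdout if None)."""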
    assert isinstance(dupes_dict, dict)
    kiys = sorted(dupes_dict, key=lambda k: k[-1])  # Sort selectors by size (their last element)
    if out_file is not None:
        qprint('Writing result in "{0}" format to file: {1} ...'.format(out_format, out_file), end='')
    else:
        qprint()
    if out_format == 'list':
        for kiy in kiys:
            flist = dupes_dict[kiy]
            print('-- {0}:'.format(kiy), file=out_file)
            flist.sort()
            for fname in flist:
                print('   {0}'.format(fname), file=out_file)
    elif out_format == 'csv':
        print('"Ord","Selector","FullPath"', file=out_file)
        order = 1
        for kiy in kiys:
            flist = dupes_dict[kiy]
            flist.sort()
            for fname in flist:
                print('"{0}","{1}","{2}"'.format(order, kiy, fname), file=out_file)
                order += 1
    if out_file is not None:
        qprint('done.')
def _main():
    dupes = get_dupes_by_size(G.StartPaths)
    dupes = refine_dupes_by_partial_hash(dupes)
    if G.FullFileHash:
        dupes = refine_dupes_by_full_hash(dupes)
    output_results(dupes, out_format=G.OutFormat, out_file=G.OutFile)
if __name__ == '__main__':
    _main()
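In case it helps with testing: the script is driven entirely by the G class, so besides editing G in place you can import the module and override its attributes before calling _main(). A hypothetical example follows (it assumes the script is saved as dupefinder.py; note that PartialCheckSize, FullBlockSize, and HashFunc are captured as default arguments when the functions are defined, so overriding them on G after import has no effect on the defaults, and they should be passed explicitly instead):

import dupefinder  # hypothetical module name for the script above

dupefinder.G.StartPaths = ['/home/me/photos', '/mnt/backup']  # hypothetical paths
dupefinder.G.MinSize = 1024 * 1024   # only consider files of 1 MiB and up
dupefinder.G.OutFormat = 'csv'
with open('dupes.csv', 'w') as fout:
    dupefinder.G.OutFile = fout      # write the CSV to a file instead of stdout
    dupefinder._main()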

