Getting information about a RAR file's content

Question

The first lines of the rarpath module (see here for context, the rest is not used here):

import enum
import re
import typing
from pathlib import Path


class RarVersion(enum.IntEnum):
    AMBIGUOUS = 0
    V3 = 3
    V5 = 5

And shared module:

import pathlib


SEVENZIP = pathlib.Path("C:\Program Files\7-Zip\7z.exe")

The rarutils module:

import dataclasses
import enum
import logging
import os
import pathlib
import re
import subprocess
import typing

import rarpath
from shared import SEVENZIP

logger = logging.getLogger(__name__)


class Algo(enum.IntEnum):
    """This class contains the possible hash algorithms."""
    CRC32 = 1
    MD5 = 2
    SHA1 = 3
    SHA256 = 4
    SHA512 = 5


@dataclasses.dataclass
class FileEntry:
    """This class contains information about a file, either on disk or as part of an archive."""
    path: pathlib.Path
    size: int
    is_dir: bool
    hash_value: bytes | None = None
    algo: Algo | None = None


@dataclasses.dataclass
class RarFile:
    """This class contains information about a RAR file."""
    version: rarpath.RarVersion
    path: pathlib.Path
    files: list[FileEntry]
    password: str | None = None

    @classmethod
    def from_path(
        cls, archive_path: pathlib.Path, password: str | None = None
    ) -> typing.Self:
        """Create a RarFile object by reading information from a RAR archive given its path."""
        infos = list_rar(archive_path, password)
        type_entries = [entry for entry in infos if "Type" in entry]
        if not type_entries or len(type_entries) > 1:
            raise ValueError(f"No 'Type' entries found in {archive_path}")
        if type_entries[0]["Type"] == "Rar3":
            version = rarpath.RarVersion.V3
        elif type_entries[0]["Type"] == "Rar5":
            version = rarpath.RarVersion.V5
        else:
            raise ValueError(
                f"Unknown RAR version {type_entries[0]['Type']} in {archive_path}"
            )

        files = []
        for entry in infos:
            if "Path" in entry and "Type" not in entry:
                entry_path = pathlib.Path(entry["Path"])
                size = int(entry["Size"])
                is_dir = entry["Folder"] == "+"
                hash_value = None
                algo = None
                if version == rarpath.RarVersion.V3:
                    # RAR5 hashes the file contents again with their respective mtimes,
                    # so the CRCs in the header are not useful for verification.
                    hash_value = bytes.fromhex(entry["CRC"]) if "CRC" in entry else None
                    algo = Algo.CRC32 if hash_value else None
                files.append(FileEntry(entry_path, size, is_dir, hash_value, algo))
        return cls(version, archive_path, files, password)

    @property
    def hash_values_exist(self) -> bool:
        """Check if *all* files already have hash values."""
        return all(file.hash_value for file in self.files if not file.is_dir)

    def update_hash_values(self):
        """Update the hash values of all files in the archive.
        This will always use the slow method. """
        for entry in self.files:
            if not entry.hash_value and not entry.is_dir:
                try:
                    crc = get_crc32_slow(
                        self.path, entry.path, self.password
                    )  # used for V5, slow
                    entry.hash_value = crc
                except subprocess.CalledProcessError:
                    logger.error(f"Failed to get CRC32 for {entry.path}")

    def pretty_str(self) -> str:
        """Return a pretty string representation of RAR file info and its contents."""
        maxlen_path = max(len(str(file.path)) for file in self.files)
        maxlen_size = max(len(str(file.size)) for file in self.files)
        maxlen_hash = 8 if any(file.hash_value for file in self.files) else 0
        maxlen_algo = 5 if any(file.algo for file in self.files) else 0
        ret = f"RAR file: {self.path}\n"
        ret += f"Version: {self.version.name}\n"
        ret += f"Encrypted: {self.password is not None}\n"
        ret += f"Password: {self.password}\n\n"
        for file in self.files:
            hash_str = file.hash_value.hex() if file.hash_value else ""
            algo_str = file.algo.name if file.algo else ""
            ret += (
                f"  {str(file.path):<{maxlen_path}}  {'D' if file.is_dir else 'F':1} "
                f"{file.size:>{maxlen_size}} {hash_str:<{maxlen_hash}} {algo_str:<{maxlen_algo}}\n"
            )
        return ret


def get_crc32_slow(
    archive_path: pathlib.Path | str,
    entry_path: pathlib.Path | str,
    password: str | None = None,
) -> bytes | None:
    """The slow method to get the CRC32 of a file in a RAR archive.
    7z extracts files internally - this is necessary for RAR5 archives,
    where we can't use the CRCs in the header."""
    command_line = [
        SEVENZIP,
        "t",
        "-scrc",
        "-p" + (password or ""),
        archive_path,
        entry_path,
    ]
    logger.debug(
        f"Processing archive {archive_path} with path {entry_path} using password {password}"
    )

    sub = subprocess.run(command_line, capture_output=True, check=False)

    if sub.returncode != 0:
        logger.error(
            f"Command line {' '.join(map(str, command_line))} returned {sub.returncode}"
        )
        return None

    encoding = os.device_encoding(0)
    if encoding:
        lines = sub.stdout.decode(encoding, errors="ignore").splitlines()
    else:
        raise RuntimeError("Coud not get encoding")

    crc_match = next(
        (
            m.group(1)
            for line in lines
            if (m := re.match(r".*CRC32.*data.*([A-F0-9]{8}).*", line))
        ),
        None,
    )

    if not crc_match:
        logger.debug(f"Failed to get CRC for {archive_path}: {entry_path}")
        return None
    logger.debug(f"Got CRC {crc_match} for {archive_path}: {entry_path}")
    return bytes.fromhex(crc_match)


def list_rar(
    archive: pathlib.Path, password: str | None = None
) -> list[dict[str, str]]:
    """Get an info list about the archive and its contents."""
    logger.debug(f"Listing {archive}, using password {password}")

    command_line = [
        str(SEVENZIP),
        "l",
        "-slt",
        "-p" + (password if password else ""),
        str(archive),
    ]

    sub = subprocess.run(command_line, capture_output=True, check=False)
    if sub.returncode != 0:
        logger.error(
            f"Command line {' '.join(map(str, command_line))} returned {sub.returncode}"
        )
        return []

    encoding = os.device_encoding(0)
    if encoding:
        entries = sub.stdout.decode(encoding, errors="ignore").split(2 * os.linesep)
    else:
        raise RuntimeError("Coud not get encoding")
    ret = []

    for entry in entries:
        lines = entry.splitlines()
        kv_pairs = [line.split("=", 1) for line in lines if "=" in line]
        entry_dict = {k.strip(): v.strip() for k, v in kv_pairs if k.strip()}
        if entry_dict:
            ret.append(entry_dict)

    logger.debug(f"Found {len(ret)} files in {archive}")
    return ret

Some remarks:

The Algo class and the FileEntry class will later be moved to their own module, when support for SFV-files and the like is implemented (right now it's still redundant to have different hash types, since we only deal with CRC32).

For this application, it's essential that we get the standard hashes (that also every utility like rhash delivers) from the RAR file. Which RAR v3 indeed uses in its headers, but not RAR v5, where the mtime of the file is included in the hash.

The main class RarFile is to open a rar file and hold information mainly about its contents, i.e., the file paths, sizes and their hash_values. The slow method for RAR v5 will not be done without explicitly calling update_hash_values.

And sure, a direct call to 7zip library (if it exists) via C-interface may be possible, but just opening the subprocess and reading stdout was obviously easier.

Reinderien · Accepted Answer · 2025-01-30 11:59:57Z

When you write

"C:\Program Files\7-Zip\7z.exe"

you should use the r string literal prefix to make it clear to the interpreter that you didn't intend to write escape sequences.

When you define a @dataclass, consider settings slots=True for performance and robustness reasons.

This:

f"  {str(file.path)

does not need to str(); string casts are implied in arguments to an interpolated string.

I discourage this pattern:

sub = subprocess.run(command_line, capture_output=True, check=False)

if sub.returncode != 0:
    logger.error(
        f"Command line {' '.join(map(str, command_line))} returned {sub.returncode}"
    )
    return None

You should check=True within a try. You have a logger (good!) so you can pass exc_info=True during your except. That means you could convert to check_output. You should also pass text=True and not bother about the encoding stuff.

For logger calls like this:

logger.debug(f"Listing {archive}, using password {password}")

Don't do string interpolation. Use the logger's own variadic-argument percent-style, as in

logger.debug('Listing %s, using password %s', archive, password)

AFAIK using a property for size isn't possible. The paths in the FileEntry aren't necessarily paths of real files, but only the entries of the RarFile (where the size was extracted). So I wonder if it is more correct to use PurePath here, since Path is supposed to be for concrete paths. — Amaterasu
– Amaterasu, Commented Jan 30 at 9:52

toolic · Accepted Answer · 2025-01-30 12:06:31Z

`str`

It is a common practice to use the __str__ function for printing a class. I assume that is the purpose of the pretty_str function in your RarFile class. Consider renaming the function as:

def __str__(self):

Once you've created a handle to the class, you can simply print the handle without using the function name.

Naming

The variable named ret in this function is not too descriptive. I understand it is sort of a placeholder that you are using for the returned value is some of your functions, but I think it would be better to be more specific. For example, in this function, using something like message or text might be more meaningful.

Booboo · Accepted Answer · 2025-01-31 12:55:15Z

In addition to what has already be said, here are a couple suggestions for you to consider:

Encapsulation

Your module has functions get_crc32_slow, list_rar used by your RarFile class and I am wondering why they are not be instance methods of RarFile. Even if you think that these functions might have applicability beyond their use by RarFile, since they are RAR file-related functions, wouldn't making then class methods at the very least be appropriate?

Why `@dataclass`?

The @dataclass decorator is adding many methods to your class that you do not seem to be using. At the same time it seems to me that a client of this class would most likely be always constructing an instance via the from_path class method and you might even want to make that the exclusive way of constructing instances. But with the class defined as is, there is no way of preventing someone from constructing via the __init__ method that @dataclass provides. Assuming you would prefer to discourage a client from directly using __init__, then this would be one way of doing it:

# No @dataclass decorator:
class RarFile:
    def __init__(self):
        raise RuntimeError('Construct this class using class method from_path')


    @classmethod
    def from_path(
        cls, archive_path: pathlib.Path, password: str | None = None
    ) -> typing.Self:
        ...

        #return cls(version, archive_path, files, password)

        instance = cls.__new__(cls)

        instance.version = version
        instance.path = archive_path
        instance.files = files
        instance.password = password

        return instance

And, of course, as suggested by the OP, if we are no longer using the @dataclass decorator we can incorporate the logic of from_path class method into an __init__ constructor and get rid of from_path altogether:

class RarFile:
    def __init__(self, archive_path: pathlib.Path, password: str | None = None) -> typing.Self:
        """Create a RarFile object by reading information from a RAR archive given its path."""
        infos = self.list_rar(archive_path, password)  # Now an instance method
        ...
        self.version = version
        self.path = archive_path
        self.files = files
        self.password = password

yes, get_crc32_slow, list_rar should be put into the RarFile class, don't make much sense outside of it. Regarding the from_path ... it could be an alternative put it all in __init__ (and use a normal class). — viuser
– viuser, Commented Jan 30 at 22:37
Ok, after looking at similar code, I see that it's a bit unusual to put I/O operations in the constructor. There are use cases (e.g., unit tests) where you want to construct the object normally via __init__ and pass the stuff it needs. So it's common to have that and an option to construct it directly via a class method that performs I/O. — viuser
– viuser, Commented Feb 2 at 9:18

Stack Exchange Network

Getting information about a RAR file's content

3 Answers 3

`str`

Naming

Encapsulation

Why `@dataclass`?

You must log in to answer this question.

Linked

Hot Network Questions

Getting information about a RAR file's content

3 Answers 3

__str__

Naming

Encapsulation

Why @dataclass?

You must log in to answer this question.

Linked

Related

Hot Network Questions

`str`

Why `@dataclass`?