The first lines of the rarpath module (shown here for context; the rest of the module is not used here):
import enum
import re
import typing
from pathlib import Path
class RarVersion(enum.IntEnum):
    """Integer-valued identifiers for the RAR format versions."""

    AMBIGUOUS = 0
    V3 = 3
    V5 = 5
And the shared module:
import pathlib
# Location of the 7-Zip command-line executable.
# The literal must be a raw string: in a normal string "\7" is an octal escape
# (the BEL character, 0x07) and "\P" is an invalid escape sequence, so the path
# would be silently corrupted.
SEVENZIP = pathlib.Path(r"C:\Program Files\7-Zip\7z.exe")
The rarutils module:
import dataclasses
import enum
import locale
import logging
import os
import pathlib
import re
import subprocess
import typing

import rarpath
from shared import SEVENZIP
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class Algo(enum.IntEnum):
    """Enumeration of the hash algorithms a hash value can belong to."""

    CRC32 = 1
    MD5 = 2
    SHA1 = 3
    SHA256 = 4
    SHA512 = 5
@dataclasses.dataclass
class FileEntry:
    """Describe a single file, either on disk or stored inside an archive.

    Attributes:
        path: Path of the file.
        size: Size of the file as an integer.
        is_dir: Whether the entry is a directory.
        hash_value: Raw hash bytes, or ``None`` when no hash is known.
        algo: Algorithm that produced ``hash_value``, or ``None``.
    """

    path: pathlib.Path
    size: int
    is_dir: bool
    hash_value: bytes | None = None
    algo: Algo | None = None
@dataclasses.dataclass
class RarFile:
    """Information about a RAR archive and the entries it contains.

    Attributes:
        version: RAR format version derived from the archive listing.
        path: Path of the archive.
        files: Entries (files and directories) found in the archive.
        password: Password used to read the archive, or ``None``.
    """

    version: rarpath.RarVersion
    path: pathlib.Path
    files: list[FileEntry]
    password: str | None = None

    @classmethod
    def from_path(
        cls, archive_path: pathlib.Path, password: str | None = None
    ) -> typing.Self:
        """Create a RarFile object by reading information from a RAR archive.

        Args:
            archive_path: Path of the archive to list.
            password: Optional password passed on to the listing.

        Returns:
            A new instance populated from the listing of *archive_path*.

        Raises:
            ValueError: If the listing does not contain exactly one record
                with a ``Type`` key, or if that type is neither ``Rar3`` nor
                ``Rar5``.
        """
        infos = list_rar(archive_path, password)
        type_entries = [entry for entry in infos if "Type" in entry]
        if not type_entries:
            raise ValueError(f"No 'Type' entries found in {archive_path}")
        if len(type_entries) > 1:
            # Previously reported with the misleading "No 'Type' entries" text.
            raise ValueError(f"Multiple 'Type' entries found in {archive_path}")

        archive_type = type_entries[0]["Type"]
        if archive_type == "Rar3":
            version = rarpath.RarVersion.V3
        elif archive_type == "Rar5":
            version = rarpath.RarVersion.V5
        else:
            raise ValueError(
                f"Unknown RAR version {archive_type} in {archive_path}"
            )

        files = []
        for entry in infos:
            # Records carrying a "Type" key describe the archive, not a member.
            if "Path" not in entry or "Type" in entry:
                continue
            hash_value = None
            algo = None
            # RAR5 hashes the file contents again with their respective mtimes,
            # so the CRCs in the header are not useful for verification; only
            # RAR3 header CRCs are taken over. An empty CRC field is treated
            # as "no hash" (None) rather than as empty bytes.
            if version == rarpath.RarVersion.V3 and entry.get("CRC"):
                hash_value = bytes.fromhex(entry["CRC"])
                algo = Algo.CRC32
            files.append(
                FileEntry(
                    pathlib.Path(entry["Path"]),
                    int(entry["Size"]),
                    entry["Folder"] == "+",
                    hash_value,
                    algo,
                )
            )
        return cls(version, archive_path, files, password)

    @property
    def hash_values_exist(self) -> bool:
        """Check if *all* non-directory entries already have hash values.

        Returns ``True`` when there are no non-directory entries at all.
        """
        return all(file.hash_value for file in self.files if not file.is_dir)

    def update_hash_values(self) -> None:
        """Update the hash values of all files in the archive.

        This will always use the slow method. Only non-directory entries
        without a hash value are processed; entries whose CRC cannot be
        determined are logged and left unchanged.
        """
        for entry in self.files:
            if entry.is_dir or entry.hash_value:
                continue
            try:
                crc = get_crc32_slow(self.path, entry.path, self.password)
            except subprocess.CalledProcessError:
                crc = None
            if crc is None:
                # get_crc32_slow signals failure by returning None, so the
                # failure must be detected here and not only via exceptions.
                logger.error(f"Failed to get CRC32 for {entry.path}")
                continue
            entry.hash_value = crc
            # Record the algorithm so hash_value and algo stay consistent.
            entry.algo = Algo.CRC32

    def pretty_str(self) -> str:
        """Return a pretty string representation of RAR file info and its contents.

        Works for archives without entries as well (only the header is
        returned in that case).
        """
        # default=0 keeps max() from raising on an empty entry list.
        maxlen_path = max((len(str(file.path)) for file in self.files), default=0)
        maxlen_size = max((len(str(file.size)) for file in self.files), default=0)
        maxlen_hash = 8 if any(file.hash_value for file in self.files) else 0
        maxlen_algo = 5 if any(file.algo for file in self.files) else 0

        lines = [
            f"RAR file: {self.path}",
            f"Version: {self.version.name}",
            f"Encrypted: {self.password is not None}",
            f"Password: {self.password}",
            "",
        ]
        for file in self.files:
            hash_str = file.hash_value.hex() if file.hash_value else ""
            algo_str = file.algo.name if file.algo else ""
            lines.append(
                f" {str(file.path):<{maxlen_path}} {'D' if file.is_dir else 'F':1} "
                f"{file.size:>{maxlen_size}} {hash_str:<{maxlen_hash}} {algo_str:<{maxlen_algo}}"
            )
        return "\n".join(lines) + "\n"
def get_crc32_slow(
    archive_path: pathlib.Path | str,
    entry_path: pathlib.Path | str,
    password: str | None = None,
) -> bytes | None:
    """The slow method to get the CRC32 of a file in a RAR archive.

    7z extracts files internally - this is necessary for RAR5 archives,
    where we can't use the CRCs in the header.

    Args:
        archive_path: Path of the archive to test.
        entry_path: Path of the entry inside the archive.
        password: Optional archive password.

    Returns:
        The CRC32 as bytes, or ``None`` if 7-Zip returned a non-zero exit
        code or no CRC line was found in its output.
    """
    command_line = [
        str(SEVENZIP),
        "t",
        "-scrc",
        # "-p" is always passed, with an empty password if none was given
        # (presumably to keep 7-Zip from prompting - confirm).
        "-p" + (password or ""),
        str(archive_path),
        str(entry_path),
    ]
    # NOTE(review): this logs the password in clear text.
    logger.debug(
        f"Processing archive {archive_path} with path {entry_path} using password {password}"
    )
    sub = subprocess.run(command_line, capture_output=True, check=False)
    if sub.returncode != 0:
        logger.error(
            f"Command line {' '.join(map(str, command_line))} returned {sub.returncode}"
        )
        return None

    # os.device_encoding(0) is None whenever stdin is not a terminal; fall back
    # to the locale encoding instead of failing after the work has been done.
    encoding = os.device_encoding(0) or locale.getencoding()
    lines = sub.stdout.decode(encoding, errors="ignore").splitlines()

    crc_pattern = re.compile(r".*CRC32.*data.*([A-F0-9]{8}).*")
    crc_match = next(
        (m.group(1) for line in lines if (m := crc_pattern.match(line))),
        None,
    )
    if not crc_match:
        logger.debug(f"Failed to get CRC for {archive_path}: {entry_path}")
        return None
    logger.debug(f"Got CRC {crc_match} for {archive_path}: {entry_path}")
    return bytes.fromhex(crc_match)
def list_rar(
    archive: pathlib.Path, password: str | None = None
) -> list[dict[str, str]]:
    """Get an info list about the archive and its contents.

    Args:
        archive: Path of the archive to list.
        password: Optional archive password.

    Returns:
        One dict per record of the 7-Zip listing, mapping each stripped key
        of a ``key = value`` line to its stripped value. An empty list is
        returned if 7-Zip exits with a non-zero return code.
    """
    # NOTE(review): this logs the password in clear text.
    logger.debug(f"Listing {archive}, using password {password}")
    command_line = [
        str(SEVENZIP),
        "l",
        "-slt",
        "-p" + (password if password else ""),
        str(archive),
    ]
    sub = subprocess.run(command_line, capture_output=True, check=False)
    if sub.returncode != 0:
        logger.error(
            f"Command line {' '.join(map(str, command_line))} returned {sub.returncode}"
        )
        return []

    # os.device_encoding(0) is None whenever stdin is not a terminal; fall back
    # to the locale encoding instead of raising.
    encoding = os.device_encoding(0) or locale.getencoding()
    text = sub.stdout.decode(encoding, errors="ignore")

    # Records are separated by blank lines. Accept both "\n" and "\r\n" so the
    # split does not depend on the output matching os.linesep.
    ret = []
    for block in re.split(r"(?:\r?\n){2,}", text):
        record = {}
        for line in block.splitlines():
            key, sep, value = line.partition("=")
            key = key.strip()
            if sep and key:
                record[key] = value.strip()
        if record:
            ret.append(record)
    logger.debug(f"Found {len(ret)} files in {archive}")
    return ret
Some remarks:
The Algo and FileEntry classes will later be moved to their own module once support for SFV files and the like is implemented (right now it's still redundant to have different hash types, since we only deal with CRC32).
For this application, it's essential that we get the standard hashes (the ones that every utility like rhash also delivers) from the RAR file. RAR v3 does indeed store these in its headers, but RAR v5 does not: there, the mtime of the file is included in the hash.
The main class, RarFile, opens a RAR file and holds information about it, mainly about its contents, i.e., the file paths, sizes, and hash values. The slow method for RAR v5 is only run when update_hash_values is called explicitly.
And sure, a direct call into the 7-Zip library (if one exists) via a C interface may be possible, but just spawning a subprocess and reading its stdout was obviously easier.