I know that the code uses a lot of map constructs, but I find it easier to reason with them.
I don't, and to make a (perhaps bold) claim about the "typical programmer", I don't think that they would either. The functional-nesting map/lambda style is difficult to understand and maintain. I discourage this sort of code. It also loses the ability to generate meaningful messages describing the specific path that produces an error.
Also, the lambdas introduce a new scope, which IMHO prevents subtle bugs in comparison to list comprehensions.
I'm not sure what you mean by new scope in this context, but overall this seems incorrect. Lambdas have closure scope, so they have access to all of the symbols above them. The parameter is constrained to the scope of the lambda, but this is no different from variables of iteration in a comprehension which are also discarded outside of the comprehension.
Using regexes might make this marginally easier, but it is probably not worth it.
Well, whether it's easier or not, you're currently not validating enough of the path sequence consistency, and regexes will do a more thorough job; so I do consider them worth it. They will be able to capture and verify a consistent stem, suffix and suffix index. pathlib is helpful but not helpful enough.
One thing is missing from the verification: checking for gaps in the list of files. But this is perhaps not that useful, since the missing file may be exactly the last one.
Those are two different scenarios. A missing last file will indeed be impossible to detect without attempting a decompress, but a gap anywhere else will be visible.
Write unit tests.
Your code would benefit from detecting and reporting the RAR naming variant used.
Validation Demo
import enum
import re
import typing
from pathlib import Path
class RarVersion(enum.IntEnum):
    """Naming convention detected for a list of RAR volume paths."""

    # A single path that fits the V5 pattern with index 1; such a name is
    # also a valid lone V3 archive, so the variant cannot be decided.
    AMBIGUOUS = 0

    # Old-style naming: one "<stem>.rar" plus "<stem>.rNN" volumes.
    V3 = 3

    # New-style naming: "<stem>.partN.rar" volumes.
    V5 = 5
# Old-style (version 3) volume name: "<stem>.rar" for the first volume, or
# "<stem>.rNN" with a two-digit index for the following ones.  The 'index'
# group is None for the un-indexed ".rar" form.
V3_PAT = re.compile(
    r'''(?x)
    ^                   # start
    (?P<stem>
        .+              # require a stem of at least one character
    )
    \.                  # suffix separator dot
    (?P<suffix>
        rar |           # first literal 'rar' suffix
        r(?P<index>
            [0-9]{2}    # two-digit ASCII index (\d would also accept other Unicode digits)
        )
    )
    \Z                  # true end of string; a bare dollar anchor would tolerate a trailing newline
    ''')
# New-style (version 5) volume name: "<stem>.part<N>.rar" with a decimal
# index of one or more digits.
V5_PAT = re.compile(
    r'''(?x)
    ^                   # start
    (?P<stem>
        .+              # require a stem of at least one character
    )
    \.part              # beginning of first suffix component
    (?P<index>
        [0-9]+          # at least one ASCII digit (\d would also accept other Unicode digits)
    )
    \.                  # beginning of last suffix component
    (?P<suffix>
        rar             # last suffix component
    )
    \Z                  # true end of string; a bare dollar anchor would tolerate a trailing newline
    ''')
def rar_list_verify(paths: typing.Sequence[str | Path]) -> RarVersion:
if len(paths) == 0:
# Since there is no non-indexed .rar, this must be interpreted as an "empty V5"
return RarVersion.V5
matches = [V5_PAT.match(str(p)) for p in paths]
if any(m is None for m in matches):
matches = [V3_PAT.match(str(p)) for p in paths]
version = RarVersion.V3
for path, match in zip(paths, matches):
if match is None:
raise ValueError(f'"{path}" does not match the version-3 pattern')
elif len(paths) > 1:
version = RarVersion.V5
else:
version = RarVersion.AMBIGUOUS
stem = matches[0]['stem']
for i, match in enumerate(matches[1:], start=1):
if match['stem'] != stem:
raise ValueError(f'{paths[i]} has an inconsistent stem')
actual = {
int(match['index'])
for match in matches
if match['index']
}
match version:
case RarVersion.V3:
base = 0
count = len(paths) - 1
case RarVersion.V5:
base = 1
count = len(paths)
case RarVersion.AMBIGUOUS:
# It's only possible for this to be a valid V5 if the only index is 1
if actual == {1}:
return version
version = RarVersion.V3
base = 0
count = 0
if version == RarVersion.V3:
n_unnumbered = sum(
1 for match in matches
if match['suffix'] == 'rar'
)
if n_unnumbered != 1:
raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one')
if count > 0:
expected = set(range(base, base + count))
spurious = actual - expected
if spurious:
raise ValueError(
'The following indices are unexpected: '
+ ', '.join(str(i) for i in spurious)
)
missing = expected - actual
if missing:
raise ValueError(
'The following indices are missing: '
+ ', '.join(str(i) for i in missing)
)
return version
def test() -> None:
    """Check rar_list_verify against accepted and rejected volume lists."""
    # (input paths, expected version, description)
    accepted = (
        (('a.part1.rar', 'a.part2.rar'), RarVersion.V5, 'Simple V5'),
        (('a.rar', 'a.r00', 'a.r01'), RarVersion.V3, 'Simple V3'),
        (('a.rar',), RarVersion.V3, 'Almost ambiguous but cannot be V5'),
        (
            ('a.part1.rar',), RarVersion.AMBIGUOUS,
            'Actually ambiguous even though it is likely V5',
        ),
        (
            ('a.part2.rar',), RarVersion.V3,
            'Invalid index forces this to be interpreted as V3',
        ),
        ((), RarVersion.V5, 'Empty input is only interpretable as a V5'),
    )
    for paths, expected_version, description in accepted:
        assert rar_list_verify(paths) == expected_version, description

    # (input paths, description, exact ValueError message)
    rejected = (
        (('',), 'Bad format',
         '"" does not match the version-3 pattern'),
        (('a.rar', 'b.r00'), 'Disparate stems',
         'b.r00 has an inconsistent stem'),
        (('a.r00', 'a.r01'), 'Missing non-indexed suffix',
         '0 paths have a non-indexed suffix; must be exactly one'),
        (('a.rar', 'a.rar'), 'Duplicate non-indexed suffixes',
         '2 paths have a non-indexed suffix; must be exactly one'),
        (('a.part0.rar', 'a.part1.rar'), 'V5 indexed from wrong base value',
         'The following indices are unexpected: 0'),
        (('a.part1.rar', 'a.part1.rar'), 'V5 missing an index',
         'The following indices are missing: 2'),
    )
    for paths, description, expected_message in rejected:
        try:
            rar_list_verify(paths)
        except ValueError as e:
            assert str(e) == expected_message
        else:
            # No ValueError was raised, so the case was wrongly accepted.
            raise AssertionError(description)
# Run the validation self-checks when this file is executed as a script.
if __name__ == '__main__':
    test()
Sorting Demo
To sort after parsing and validation, I propose that you expose an intermediate path object that remembers the index. This will better reuse the effort from the parse step.
import enum
import os
import re
import typing
from pathlib import Path
class RarVersion(enum.IntEnum):
    """Naming convention detected for a list of RAR volume paths."""

    # A single path that fits the V5 pattern with index 1; such a name is
    # also a valid lone V3 archive, so the variant cannot be decided.
    AMBIGUOUS = 0

    # Old-style naming: one "<stem>.rar" plus "<stem>.rNN" volumes.
    V3 = 3

    # New-style naming: "<stem>.partN.rar" volumes.
    V5 = 5
# Old-style (version 3) volume name: "<stem>.rar" for the first volume, or
# "<stem>.rNN" with a two-digit index for the following ones.  The 'index'
# group is None for the un-indexed ".rar" form.
V3_PAT = re.compile(
    r'''(?x)
    ^                   # start
    (?P<stem>
        .+              # require a stem of at least one character
    )
    \.                  # suffix separator dot
    (?P<suffix>
        rar |           # first literal 'rar' suffix
        r(?P<index>
            [0-9]{2}    # two-digit ASCII index (\d would also accept other Unicode digits)
        )
    )
    \Z                  # true end of string; a bare dollar anchor would tolerate a trailing newline
    ''')
# New-style (version 5) volume name: "<stem>.part<N>.rar" with a decimal
# index of one or more digits.
V5_PAT = re.compile(
    r'''(?x)
    ^                   # start
    (?P<stem>
        .+              # require a stem of at least one character
    )
    \.part              # beginning of first suffix component
    (?P<index>
        [0-9]+          # at least one ASCII digit (\d would also accept other Unicode digits)
    )
    \.                  # beginning of last suffix component
    (?P<suffix>
        rar             # last suffix component
    )
    \Z                  # true end of string; a bare dollar anchor would tolerate a trailing newline
    ''')
class RARPath(typing.NamedTuple):
index: int
path: str
stem: str
suffix: str
@classmethod
def from_match(cls, match: re.Match) -> typing.Self:
return cls(
index=-1 if match['index'] is None else int(match['index']),
path=match.string,
stem=match['stem'],
suffix=match['suffix'],
)
def __str__(self) -> str:
return self.path
def parse_rar_list(paths: typing.Sequence[str | Path]) -> tuple[RarVersion, list[RARPath]]:
    """Parse and validate RAR volume paths, reporting the naming variant used.

    All paths must follow one naming convention and share one stem, and the
    volume indices must form a gap-free run: 1..n for V5, or the un-indexed
    ``.rar`` (index -1) followed by 0, 1, ... for V3.  The expected count is
    derived from ``len(paths)``, so a missing *last* volume cannot be noticed.

    Args:
        paths: Volume paths, in any order.

    Returns:
        A ``(version, parsed)`` pair.  ``parsed`` holds one ``RARPath`` per
        input path, in input order, parsed according to ``version``.  An
        empty input yields ``(RarVersion.V5, [])``; a single path that fits
        the V5 pattern with index 1 yields ``RarVersion.AMBIGUOUS`` with the
        V5 parse.

    Raises:
        ValueError: if a path fits neither pattern, the stems differ, a V3
            list does not contain exactly one un-indexed ``.rar``, or the
            index set has unexpected or missing members.
    """
    if len(paths) == 0:
        # Since there is no non-indexed .rar, this must be interpreted as an "empty V5"
        return RarVersion.V5, []

    names = [str(p) for p in paths]
    matches = [V5_PAT.match(name) for name in names]
    if any(m is None for m in matches):
        # At least one path is not V5-style, so every path has to be V3-style.
        version = RarVersion.V3
        matches = [V3_PAT.match(name) for name in names]
        for path, m in zip(paths, matches):
            if m is None:
                raise ValueError(f'"{path}" does not match the version-3 pattern')
    elif len(paths) > 1:
        version = RarVersion.V5
    elif int(matches[0]['index']) == 1:
        # It's only possible for a single path to be a valid V5 if its index is 1
        return RarVersion.AMBIGUOUS, [RARPath.from_match(matches[0])]
    else:
        # A lone "<stem>.partN.rar" with N != 1 can only be a V3 archive whose
        # stem ends in ".partN".  Re-parse it as V3 so the returned RARPath
        # (stem, index -1) agrees with the reported version; anything that
        # matches V5_PAT also matches V3_PAT.
        version = RarVersion.V3
        matches = [V3_PAT.match(name) for name in names]

    parsed = [RARPath.from_match(m) for m in matches]

    stem = parsed[0].stem
    for entry in parsed[1:]:
        if entry.stem != stem:
            raise ValueError(f'{entry} has an inconsistent stem')

    if version == RarVersion.V3:
        n_unnumbered = sum(
            1 for entry in parsed
            if entry.suffix == 'rar'
        )
        if n_unnumbered != 1:
            raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one')
        # The un-indexed ".rar" is represented by index -1.
        base = -1
    else:
        base = 1

    actual = {entry.index for entry in parsed}
    expected = set(range(base, base + len(paths)))

    # Sets iterate in arbitrary order; sort so the messages are stable.
    spurious = actual - expected
    if spurious:
        raise ValueError(
            'The following indices are unexpected: '
            + ', '.join(str(i) for i in sorted(spurious))
        )

    missing = expected - actual
    if missing:
        raise ValueError(
            'The following indices are missing: '
            + ', '.join(str(i) for i in sorted(missing))
        )

    return version, parsed
def rar_sort(paths: typing.Iterable[str | Path]) -> list[str]:
    """Validate RAR volume paths and return them as strings in volume order.

    Args:
        paths: Volume paths, in any order.  Any iterable is accepted; it is
            materialised first because validation needs its length.

    Returns:
        The path strings sorted by volume index; for V3 lists the un-indexed
        ``.rar`` (index -1) comes first.

    Raises:
        ValueError: propagated from ``parse_rar_list`` when the list is invalid.
    """
    _, parsed = parse_rar_list(list(paths))
    # RARPath is a tuple whose first field is the index, so the default
    # ordering sorts by volume index.
    return [entry.path for entry in sorted(parsed)]
def test_parse() -> None:
    """Check the version reported by parse_rar_list and its error messages."""
    # (input paths, expected version, description)
    accepted = (
        (('a.part1.rar', 'a.part2.rar'), RarVersion.V5, 'Simple V5'),
        (('a.rar', 'a.r00', 'a.r01'), RarVersion.V3, 'Simple V3'),
        (('a.rar',), RarVersion.V3, 'Almost ambiguous but cannot be V5'),
        (
            ('a.part1.rar',), RarVersion.AMBIGUOUS,
            'Actually ambiguous even though it is likely V5',
        ),
        (
            ('a.part2.rar',), RarVersion.V3,
            'Invalid index forces this to be interpreted as V3',
        ),
        ((), RarVersion.V5, 'Empty input is only interpretable as a V5'),
    )
    for paths, expected_version, description in accepted:
        detected, _ = parse_rar_list(paths)
        assert detected == expected_version, description

    # (input paths, description, exact ValueError message)
    rejected = (
        (('',), 'Bad format',
         '"" does not match the version-3 pattern'),
        (('a.rar', 'b.r00'), 'Disparate stems',
         'b.r00 has an inconsistent stem'),
        (('a.r00', 'a.r01'), 'Missing non-indexed suffix',
         '0 paths have a non-indexed suffix; must be exactly one'),
        (('a.rar', 'a.rar'), 'Duplicate non-indexed suffixes',
         '2 paths have a non-indexed suffix; must be exactly one'),
        (('a.part0.rar', 'a.part1.rar'), 'V5 indexed from wrong base value',
         'The following indices are unexpected: 0'),
        (('a.part1.rar', 'a.part1.rar'), 'V5 missing an index',
         'The following indices are missing: 2'),
    )
    for paths, description, expected_message in rejected:
        try:
            parse_rar_list(paths)
        except ValueError as e:
            assert str(e) == expected_message
        else:
            # No ValueError was raised, so the case was wrongly accepted.
            raise AssertionError(description)
def test_sort() -> None:
    """Check that rar_sort orders shuffled V3 and V5 volume lists."""
    v3_sorted = rar_sort(('a.r00', 'a.rar', 'a.r01'))
    assert v3_sorted == ['a.rar', 'a.r00', 'a.r01'], 'Simple v3 sort'

    v5_sorted = rar_sort(('a.part2.rar', 'a.part1.rar'))
    assert v5_sorted == ['a.part1.rar', 'a.part2.rar']
# Run the parse and sort self-checks when this file is executed as a script.
if __name__ == '__main__':
    test_parse()
    test_sort()