Skip to main content
add sorting
Source Link
Reinderien
  • 71.1k
  • 5
  • 76
  • 256

Validation Demo

import enum
import re
import typing
from pathlib import Path


class RarVersion(enum.IntEnum):
    """Naming scheme detected for a set of RAR volume paths."""

    # A single "<stem>.part1.rar" path: cannot be told apart from a one-volume V5 set.
    AMBIGUOUS = 0
    # "<stem>.rar" plus "<stem>.rNN" volumes.
    V3 = 3
    # "<stem>.part<N>.rar" volumes.
    V5 = 5


# Version-3 volume names: "<stem>.rar" for the unnumbered volume, or "<stem>.rNN"
# with a two-digit index. The 'index' group is None for the plain ".rar" form.
# NOTE(review): '$' also matches just before a trailing newline; confirm inputs never carry one.
V3_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.      # suffix separator dot
    (?P<suffix>
        rar |  # first literal 'rar' suffix
        r(?P<index>
            \d\d  # two-digit index
        )
    )
    $  # end
''')

# Version-5 volume names: "<stem>.part<N>.rar" where N is one or more decimal digits.
V5_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.part   # beginning of first suffix component
    (?P<index>
        \d+  # at least one digit
    )
    \.       # beginning of last suffix component
    (?P<suffix>
        rar  # last suffix component
    )
    $        # end
''')


def rar_list_verify(paths: typing.Sequence[str | Path]) -> RarVersion:
    """Check that `paths` form one consistent RAR volume set and report its naming scheme.

    Every path is first tried against V5_PAT; if any fails, all are re-tried against V3_PAT.

    Returns:
        V5 for an empty input or a valid multi-path V5 set, AMBIGUOUS for a single
        V5-shaped path whose index is 1, and V3 otherwise.

    Raises:
        ValueError: a path matches neither pattern, stems differ, a V3 set does not
            contain exactly one non-indexed path, or indices are unexpected or missing.
    """
    total = len(paths)
    if total == 0:
        # Since there is no non-indexed .rar, this must be interpreted as an "empty V5"
        return RarVersion.V5

    names = [str(p) for p in paths]
    found = [V5_PAT.match(name) for name in names]

    if all(m is not None for m in found):
        version = RarVersion.V5 if total > 1 else RarVersion.AMBIGUOUS
    else:
        # At least one path is not V5-shaped, so the whole set must be V3.
        version = RarVersion.V3
        found = [V3_PAT.match(name) for name in names]
        for path, m in zip(paths, found):
            if m is None:
                raise ValueError(f'"{path}" does not match the version-3 pattern')

    common_stem = found[0]['stem']
    for position, m in enumerate(found):
        # Position 0 trivially agrees with itself.
        if m['stem'] != common_stem:
            raise ValueError(f'{paths[position]} has an inconsistent stem')

    # Indices actually present; the plain ".rar" form contributes none.
    actual = set()
    for m in found:
        digits = m['index']
        if digits:
            actual.add(int(digits))

    if version == RarVersion.AMBIGUOUS:
        # It's only possible for this to be a valid V5 if the only index is 1
        if actual == {1}:
            return version
        version, base, count = RarVersion.V3, 0, 0
    elif version == RarVersion.V3:
        # One path is the unnumbered ".rar", the rest are numbered from 0.
        base, count = 0, total - 1
    else:
        base, count = 1, total

    if version == RarVersion.V3:
        n_unnumbered = sum(m['suffix'] == 'rar' for m in found)
        if n_unnumbered != 1:
            raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one')

    if count <= 0:
        return version

    expected = set(range(base, base + count))
    spurious = actual - expected
    if spurious:
        raise ValueError(
            'The following indices are unexpected: '
            + ', '.join(map(str, spurious))
        )
    missing = expected - actual
    if missing:
        raise ValueError(
            'The following indices are missing: '
            + ', '.join(map(str, missing))
        )
    return version


def test() -> None:
    """Exercise rar_list_verify on accepted path sets and on each rejection message."""
    accepted = (
        (('a.part1.rar', 'a.part2.rar'), RarVersion.V5, 'Simple V5'),
        (('a.rar', 'a.r00', 'a.r01'), RarVersion.V3, 'Simple V3'),
        (('a.rar',), RarVersion.V3, 'Almost ambiguous but cannot be V5'),
        (('a.part1.rar',), RarVersion.AMBIGUOUS, 'Actually ambiguous even though it is likely V5'),
        (('a.part2.rar',), RarVersion.V3, 'Invalid index forces this to be interpreted as V3'),
        ((), RarVersion.V5, 'Empty input is only interpretable as a V5'),
    )
    for paths, version, label in accepted:
        assert rar_list_verify(paths) == version, label

    # (input, label raised if no error occurs, exact expected error text)
    rejected = (
        (('',), 'Bad format', '"" does not match the version-3 pattern'),
        (('a.rar', 'b.r00'), 'Disparate stems', 'b.r00 has an inconsistent stem'),
        (
            ('a.r00', 'a.r01'), 'Missing non-indexed suffix',
            '0 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.rar', 'a.rar'), 'Duplicate non-indexed suffixes',
            '2 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.part0.rar', 'a.part1.rar'), 'V5 indexed from wrong base value',
            'The following indices are unexpected: 0',
        ),
        (
            ('a.part1.rar', 'a.part1.rar'), 'V5 missing an index',
            'The following indices are missing: 2',
        ),
    )
    for paths, label, message in rejected:
        try:
            rar_list_verify(paths)
        except ValueError as error:
            assert str(error) == message
        else:
            raise AssertionError(label)


# Run the self-checks when this file is executed as a script.
if __name__ == '__main__':
    test()

Sorting Demo

To sort after parsing and validation, I propose that you expose an intermediate path object that remembers the index. This will better reuse the effort from the parse step.

import enum
import os
import re
import typing
from pathlib import Path


class RarVersion(enum.IntEnum):
    """Naming scheme detected for a set of RAR volume paths."""

    # A single "<stem>.part1.rar" path: cannot be told apart from a one-volume V5 set.
    AMBIGUOUS = 0
    # "<stem>.rar" plus "<stem>.rNN" volumes.
    V3 = 3
    # "<stem>.part<N>.rar" volumes.
    V5 = 5


# Version-3 volume names: "<stem>.rar" for the unnumbered volume, or "<stem>.rNN"
# with a two-digit index. The 'index' group is None for the plain ".rar" form.
# NOTE(review): '$' also matches just before a trailing newline; confirm inputs never carry one.
V3_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.      # suffix separator dot
    (?P<suffix>
        rar |  # first literal 'rar' suffix
        r(?P<index>
            \d\d  # two-digit index
        )
    )
    $  # end
''')

# Version-5 volume names: "<stem>.part<N>.rar" where N is one or more decimal digits.
V5_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.part   # beginning of first suffix component
    (?P<index>
        \d+  # at least one digit
    )
    \.       # beginning of last suffix component
    (?P<suffix>
        rar  # last suffix component
    )
    $        # end
''')


class RARPath(typing.NamedTuple):
    index: int
    path: str
    stem: str
    suffix: str

    @classmethod
    def from_match(cls, match: re.Match) -> typing.Self:
        return cls(
            index=-1 if match['index'] is None else int(match['index']),
            path=match.string,
            stem=match['stem'],
            suffix=match['suffix'],
        )

    def __str__(self) -> str:
        return self.path


def parse_rar_list(paths: typing.Sequence[str | Path]) -> tuple[RarVersion, list[RARPath]]:
    if len(paths) == 0:
        # Since there is no non-indexed .rar, this must be interpreted as an "empty V5"
        return RarVersion.V5, []

    matches = [V5_PAT.match(str(p)) for p in paths]

    if any(m is None for m in matches):
        matches = [V3_PAT.match(str(p)) for p in paths]
        version = RarVersion.V3

        for path, match in zip(paths, matches):
            if match is None:
                raise ValueError(f'"{path}" does not match the version-3 pattern')
    elif len(paths) > 1:
        version = RarVersion.V5
    else:
        version = RarVersion.AMBIGUOUS

    parsed = [RARPath.from_match(match) for match in matches]

    stem = parsed[0].stem
    for match in parsed[1:]:
        if match.stem != stem:
            raise ValueError(f'{match} has an inconsistent stem')

    actual = {match.index for match in parsed}

    match version:
        case RarVersion.V3:
            base = -1
        case RarVersion.V5:
            base = 1
        case RarVersion.AMBIGUOUS:
            # It's only possible for this to be a valid V5 if the only index is 1
            if actual == {1}:
                return version, parsed
            version = RarVersion.V3
            base = -1

            # This started as an ambiguous case where the index might have been part of a V5 suffix.
            # Since we've ruled that out, the actual index set is reinterpreted as the base only (-1).
            actual = {-1}

    if version == RarVersion.V3:
        n_unnumbered = sum(
            1 for match in parsed
            if match.suffix == 'rar'
        )
        if n_unnumbered != 1:
            raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one')

    expected = set(range(base, base + len(paths)))
    spurious = actual - expected
    if spurious:
        raise ValueError(
            'The following indices are unexpected: '
            + ', '.join(str(i) for i in spurious)
        )
    missing = expected - actual
    if missing:
        raise ValueError(
            'The following indices are missing: '
            + ', '.join(str(i) for i in missing)
        )

    return version, parsed


def rar_sort(paths: typing.Sequence[str | Path]) -> list[str]:
    """Validate `paths` as one RAR volume set and return the path strings in volume order.

    The input is handed to parse_rar_list, which needs a sized sequence of str/Path
    values; the result is the matched path strings, not RARPath objects.

    Raises:
        ValueError: propagated from parse_rar_list when the set is not valid.
    """
    _, parsed = parse_rar_list(paths)
    # RARPath is a tuple whose first field is the index, so plain sorting orders by index.
    return [entry.path for entry in sorted(parsed)]


def test_parse() -> None:
    """Exercise parse_rar_list on accepted path sets and on each rejection message."""
    accepted = (
        (('a.part1.rar', 'a.part2.rar'), RarVersion.V5, 'Simple V5'),
        (('a.rar', 'a.r00', 'a.r01'), RarVersion.V3, 'Simple V3'),
        (('a.rar',), RarVersion.V3, 'Almost ambiguous but cannot be V5'),
        (('a.part1.rar',), RarVersion.AMBIGUOUS, 'Actually ambiguous even though it is likely V5'),
        (('a.part2.rar',), RarVersion.V3, 'Invalid index forces this to be interpreted as V3'),
        ((), RarVersion.V5, 'Empty input is only interpretable as a V5'),
    )
    for paths, version, label in accepted:
        # Only the detected version (first tuple element) is checked here.
        assert parse_rar_list(paths)[0] == version, label

    # (input, label raised if no error occurs, exact expected error text)
    rejected = (
        (('',), 'Bad format', '"" does not match the version-3 pattern'),
        (('a.rar', 'b.r00'), 'Disparate stems', 'b.r00 has an inconsistent stem'),
        (
            ('a.r00', 'a.r01'), 'Missing non-indexed suffix',
            '0 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.rar', 'a.rar'), 'Duplicate non-indexed suffixes',
            '2 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.part0.rar', 'a.part1.rar'), 'V5 indexed from wrong base value',
            'The following indices are unexpected: 0',
        ),
        (
            ('a.part1.rar', 'a.part1.rar'), 'V5 missing an index',
            'The following indices are missing: 2',
        ),
    )
    for paths, label, message in rejected:
        try:
            parse_rar_list(paths)
        except ValueError as error:
            assert str(error) == message
        else:
            raise AssertionError(label)


def test_sort() -> None:
    """Check that rar_sort puts shuffled V3 and V5 volume names into volume order."""
    v3_result = rar_sort(('a.r00', 'a.rar', 'a.r01'))
    assert v3_result == ['a.rar', 'a.r00', 'a.r01'], 'Simple v3 sort'

    v5_result = rar_sort(('a.part2.rar', 'a.part1.rar'))
    assert v5_result == ['a.part1.rar', 'a.part2.rar']


# Run both self-check suites when this file is executed as a script.
if __name__ == '__main__':
    test_parse()
    test_sort()

Suggested

import enum
import re
import typing
from pathlib import Path


class RarVersion(enum.IntEnum):
    """Naming scheme detected for a set of RAR volume paths."""

    # A single "<stem>.part1.rar" path: cannot be told apart from a one-volume V5 set.
    AMBIGUOUS = 0
    # "<stem>.rar" plus "<stem>.rNN" volumes.
    V3 = 3
    # "<stem>.part<N>.rar" volumes.
    V5 = 5


# Version-3 volume names: "<stem>.rar" for the unnumbered volume, or "<stem>.rNN"
# with a two-digit index. The 'index' group is None for the plain ".rar" form.
# NOTE(review): '$' also matches just before a trailing newline; confirm inputs never carry one.
V3_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.      # suffix separator dot
    (?P<suffix>
        rar |  # first literal 'rar' suffix
        r(?P<index>
            \d\d  # two-digit index
        )
    )
    $  # end
''')

# Version-5 volume names: "<stem>.part<N>.rar" where N is one or more decimal digits.
V5_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.part   # beginning of first suffix component
    (?P<index>
        \d+  # at least one digit
    )
    \.       # beginning of last suffix component
    (?P<suffix>
        rar  # last suffix component
    )
    $        # end
''')


def rar_list_verify(paths: typing.Sequence[str | Path]) -> RarVersion:
    """Check that `paths` form one consistent RAR volume set and report its naming scheme.

    Every path is first tried against V5_PAT; if any fails, all are re-tried against V3_PAT.

    Returns:
        V5 for an empty input or a valid multi-path V5 set, AMBIGUOUS for a single
        V5-shaped path whose index is 1, and V3 otherwise.

    Raises:
        ValueError: a path matches neither pattern, stems differ, a V3 set does not
            contain exactly one non-indexed path, or indices are unexpected or missing.
    """
    total = len(paths)
    if total == 0:
        # Since there is no non-indexed .rar, this must be interpreted as an "empty V5"
        return RarVersion.V5

    names = [str(p) for p in paths]
    found = [V5_PAT.match(name) for name in names]

    if all(m is not None for m in found):
        version = RarVersion.V5 if total > 1 else RarVersion.AMBIGUOUS
    else:
        # At least one path is not V5-shaped, so the whole set must be V3.
        version = RarVersion.V3
        found = [V3_PAT.match(name) for name in names]
        for path, m in zip(paths, found):
            if m is None:
                raise ValueError(f'"{path}" does not match the version-3 pattern')

    common_stem = found[0]['stem']
    for position, m in enumerate(found):
        # Position 0 trivially agrees with itself.
        if m['stem'] != common_stem:
            raise ValueError(f'{paths[position]} has an inconsistent stem')

    # Indices actually present; the plain ".rar" form contributes none.
    actual = set()
    for m in found:
        digits = m['index']
        if digits:
            actual.add(int(digits))

    if version == RarVersion.AMBIGUOUS:
        # It's only possible for this to be a valid V5 if the only index is 1
        if actual == {1}:
            return version
        version, base, count = RarVersion.V3, 0, 0
    elif version == RarVersion.V3:
        # One path is the unnumbered ".rar", the rest are numbered from 0.
        base, count = 0, total - 1
    else:
        base, count = 1, total

    if version == RarVersion.V3:
        n_unnumbered = sum(m['suffix'] == 'rar' for m in found)
        if n_unnumbered != 1:
            raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one')

    if count <= 0:
        return version

    expected = set(range(base, base + count))
    spurious = actual - expected
    if spurious:
        raise ValueError(
            'The following indices are unexpected: '
            + ', '.join(map(str, spurious))
        )
    missing = expected - actual
    if missing:
        raise ValueError(
            'The following indices are missing: '
            + ', '.join(map(str, missing))
        )
    return version


def test() -> None:
    """Exercise rar_list_verify on accepted path sets and on each rejection message."""
    accepted = (
        (('a.part1.rar', 'a.part2.rar'), RarVersion.V5, 'Simple V5'),
        (('a.rar', 'a.r00', 'a.r01'), RarVersion.V3, 'Simple V3'),
        (('a.rar',), RarVersion.V3, 'Almost ambiguous but cannot be V5'),
        (('a.part1.rar',), RarVersion.AMBIGUOUS, 'Actually ambiguous even though it is likely V5'),
        (('a.part2.rar',), RarVersion.V3, 'Invalid index forces this to be interpreted as V3'),
        ((), RarVersion.V5, 'Empty input is only interpretable as a V5'),
    )
    for paths, version, label in accepted:
        assert rar_list_verify(paths) == version, label

    # (input, label raised if no error occurs, exact expected error text)
    rejected = (
        (('',), 'Bad format', '"" does not match the version-3 pattern'),
        (('a.rar', 'b.r00'), 'Disparate stems', 'b.r00 has an inconsistent stem'),
        (
            ('a.r00', 'a.r01'), 'Missing non-indexed suffix',
            '0 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.rar', 'a.rar'), 'Duplicate non-indexed suffixes',
            '2 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.part0.rar', 'a.part1.rar'), 'V5 indexed from wrong base value',
            'The following indices are unexpected: 0',
        ),
        (
            ('a.part1.rar', 'a.part1.rar'), 'V5 missing an index',
            'The following indices are missing: 2',
        ),
    )
    for paths, label, message in rejected:
        try:
            rar_list_verify(paths)
        except ValueError as error:
            assert str(error) == message
        else:
            raise AssertionError(label)


# Run the self-checks when this file is executed as a script.
if __name__ == '__main__':
    test()

Validation Demo

import enum
import re
import typing
from pathlib import Path


class RarVersion(enum.IntEnum):
    """Naming scheme detected for a set of RAR volume paths."""

    # A single "<stem>.part1.rar" path: cannot be told apart from a one-volume V5 set.
    AMBIGUOUS = 0
    # "<stem>.rar" plus "<stem>.rNN" volumes.
    V3 = 3
    # "<stem>.part<N>.rar" volumes.
    V5 = 5


# Version-3 volume names: "<stem>.rar" for the unnumbered volume, or "<stem>.rNN"
# with a two-digit index. The 'index' group is None for the plain ".rar" form.
# NOTE(review): '$' also matches just before a trailing newline; confirm inputs never carry one.
V3_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.      # suffix separator dot
    (?P<suffix>
        rar |  # first literal 'rar' suffix
        r(?P<index>
            \d\d  # two-digit index
        )
    )
    $  # end
''')

# Version-5 volume names: "<stem>.part<N>.rar" where N is one or more decimal digits.
V5_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.part   # beginning of first suffix component
    (?P<index>
        \d+  # at least one digit
    )
    \.       # beginning of last suffix component
    (?P<suffix>
        rar  # last suffix component
    )
    $        # end
''')


def rar_list_verify(paths: typing.Sequence[str | Path]) -> RarVersion:
    """Check that `paths` form one consistent RAR volume set and report its naming scheme.

    Every path is first tried against V5_PAT; if any fails, all are re-tried against V3_PAT.

    Returns:
        V5 for an empty input or a valid multi-path V5 set, AMBIGUOUS for a single
        V5-shaped path whose index is 1, and V3 otherwise.

    Raises:
        ValueError: a path matches neither pattern, stems differ, a V3 set does not
            contain exactly one non-indexed path, or indices are unexpected or missing.
    """
    total = len(paths)
    if total == 0:
        # Since there is no non-indexed .rar, this must be interpreted as an "empty V5"
        return RarVersion.V5

    names = [str(p) for p in paths]
    found = [V5_PAT.match(name) for name in names]

    if all(m is not None for m in found):
        version = RarVersion.V5 if total > 1 else RarVersion.AMBIGUOUS
    else:
        # At least one path is not V5-shaped, so the whole set must be V3.
        version = RarVersion.V3
        found = [V3_PAT.match(name) for name in names]
        for path, m in zip(paths, found):
            if m is None:
                raise ValueError(f'"{path}" does not match the version-3 pattern')

    common_stem = found[0]['stem']
    for position, m in enumerate(found):
        # Position 0 trivially agrees with itself.
        if m['stem'] != common_stem:
            raise ValueError(f'{paths[position]} has an inconsistent stem')

    # Indices actually present; the plain ".rar" form contributes none.
    actual = set()
    for m in found:
        digits = m['index']
        if digits:
            actual.add(int(digits))

    if version == RarVersion.AMBIGUOUS:
        # It's only possible for this to be a valid V5 if the only index is 1
        if actual == {1}:
            return version
        version, base, count = RarVersion.V3, 0, 0
    elif version == RarVersion.V3:
        # One path is the unnumbered ".rar", the rest are numbered from 0.
        base, count = 0, total - 1
    else:
        base, count = 1, total

    if version == RarVersion.V3:
        n_unnumbered = sum(m['suffix'] == 'rar' for m in found)
        if n_unnumbered != 1:
            raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one')

    if count <= 0:
        return version

    expected = set(range(base, base + count))
    spurious = actual - expected
    if spurious:
        raise ValueError(
            'The following indices are unexpected: '
            + ', '.join(map(str, spurious))
        )
    missing = expected - actual
    if missing:
        raise ValueError(
            'The following indices are missing: '
            + ', '.join(map(str, missing))
        )
    return version


def test() -> None:
    """Exercise rar_list_verify on accepted path sets and on each rejection message."""
    accepted = (
        (('a.part1.rar', 'a.part2.rar'), RarVersion.V5, 'Simple V5'),
        (('a.rar', 'a.r00', 'a.r01'), RarVersion.V3, 'Simple V3'),
        (('a.rar',), RarVersion.V3, 'Almost ambiguous but cannot be V5'),
        (('a.part1.rar',), RarVersion.AMBIGUOUS, 'Actually ambiguous even though it is likely V5'),
        (('a.part2.rar',), RarVersion.V3, 'Invalid index forces this to be interpreted as V3'),
        ((), RarVersion.V5, 'Empty input is only interpretable as a V5'),
    )
    for paths, version, label in accepted:
        assert rar_list_verify(paths) == version, label

    # (input, label raised if no error occurs, exact expected error text)
    rejected = (
        (('',), 'Bad format', '"" does not match the version-3 pattern'),
        (('a.rar', 'b.r00'), 'Disparate stems', 'b.r00 has an inconsistent stem'),
        (
            ('a.r00', 'a.r01'), 'Missing non-indexed suffix',
            '0 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.rar', 'a.rar'), 'Duplicate non-indexed suffixes',
            '2 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.part0.rar', 'a.part1.rar'), 'V5 indexed from wrong base value',
            'The following indices are unexpected: 0',
        ),
        (
            ('a.part1.rar', 'a.part1.rar'), 'V5 missing an index',
            'The following indices are missing: 2',
        ),
    )
    for paths, label, message in rejected:
        try:
            rar_list_verify(paths)
        except ValueError as error:
            assert str(error) == message
        else:
            raise AssertionError(label)


# Run the self-checks when this file is executed as a script.
if __name__ == '__main__':
    test()

Sorting Demo

To sort after parsing and validation, I propose that you expose an intermediate path object that remembers the index. This will better reuse the effort from the parse step.

import enum
import os
import re
import typing
from pathlib import Path


class RarVersion(enum.IntEnum):
    """Naming scheme detected for a set of RAR volume paths."""

    # A single "<stem>.part1.rar" path: cannot be told apart from a one-volume V5 set.
    AMBIGUOUS = 0
    # "<stem>.rar" plus "<stem>.rNN" volumes.
    V3 = 3
    # "<stem>.part<N>.rar" volumes.
    V5 = 5


# Version-3 volume names: "<stem>.rar" for the unnumbered volume, or "<stem>.rNN"
# with a two-digit index. The 'index' group is None for the plain ".rar" form.
# NOTE(review): '$' also matches just before a trailing newline; confirm inputs never carry one.
V3_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.      # suffix separator dot
    (?P<suffix>
        rar |  # first literal 'rar' suffix
        r(?P<index>
            \d\d  # two-digit index
        )
    )
    $  # end
''')

# Version-5 volume names: "<stem>.part<N>.rar" where N is one or more decimal digits.
V5_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.part   # beginning of first suffix component
    (?P<index>
        \d+  # at least one digit
    )
    \.       # beginning of last suffix component
    (?P<suffix>
        rar  # last suffix component
    )
    $        # end
''')


class RARPath(typing.NamedTuple):
    index: int
    path: str
    stem: str
    suffix: str

    @classmethod
    def from_match(cls, match: re.Match) -> typing.Self:
        return cls(
            index=-1 if match['index'] is None else int(match['index']),
            path=match.string,
            stem=match['stem'],
            suffix=match['suffix'],
        )

    def __str__(self) -> str:
        return self.path


def parse_rar_list(paths: typing.Sequence[str | Path]) -> tuple[RarVersion, list[RARPath]]:
    if len(paths) == 0:
        # Since there is no non-indexed .rar, this must be interpreted as an "empty V5"
        return RarVersion.V5, []

    matches = [V5_PAT.match(str(p)) for p in paths]

    if any(m is None for m in matches):
        matches = [V3_PAT.match(str(p)) for p in paths]
        version = RarVersion.V3

        for path, match in zip(paths, matches):
            if match is None:
                raise ValueError(f'"{path}" does not match the version-3 pattern')
    elif len(paths) > 1:
        version = RarVersion.V5
    else:
        version = RarVersion.AMBIGUOUS

    parsed = [RARPath.from_match(match) for match in matches]

    stem = parsed[0].stem
    for match in parsed[1:]:
        if match.stem != stem:
            raise ValueError(f'{match} has an inconsistent stem')

    actual = {match.index for match in parsed}

    match version:
        case RarVersion.V3:
            base = -1
        case RarVersion.V5:
            base = 1
        case RarVersion.AMBIGUOUS:
            # It's only possible for this to be a valid V5 if the only index is 1
            if actual == {1}:
                return version, parsed
            version = RarVersion.V3
            base = -1

            # This started as an ambiguous case where the index might have been part of a V5 suffix.
            # Since we've ruled that out, the actual index set is reinterpreted as the base only (-1).
            actual = {-1}

    if version == RarVersion.V3:
        n_unnumbered = sum(
            1 for match in parsed
            if match.suffix == 'rar'
        )
        if n_unnumbered != 1:
            raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one')

    expected = set(range(base, base + len(paths)))
    spurious = actual - expected
    if spurious:
        raise ValueError(
            'The following indices are unexpected: '
            + ', '.join(str(i) for i in spurious)
        )
    missing = expected - actual
    if missing:
        raise ValueError(
            'The following indices are missing: '
            + ', '.join(str(i) for i in missing)
        )

    return version, parsed


def rar_sort(paths: typing.Sequence[str | Path]) -> list[str]:
    """Validate `paths` as one RAR volume set and return the path strings in volume order.

    The input is handed to parse_rar_list, which needs a sized sequence of str/Path
    values; the result is the matched path strings, not RARPath objects.

    Raises:
        ValueError: propagated from parse_rar_list when the set is not valid.
    """
    _, parsed = parse_rar_list(paths)
    # RARPath is a tuple whose first field is the index, so plain sorting orders by index.
    return [entry.path for entry in sorted(parsed)]


def test_parse() -> None:
    """Exercise parse_rar_list on accepted path sets and on each rejection message."""
    accepted = (
        (('a.part1.rar', 'a.part2.rar'), RarVersion.V5, 'Simple V5'),
        (('a.rar', 'a.r00', 'a.r01'), RarVersion.V3, 'Simple V3'),
        (('a.rar',), RarVersion.V3, 'Almost ambiguous but cannot be V5'),
        (('a.part1.rar',), RarVersion.AMBIGUOUS, 'Actually ambiguous even though it is likely V5'),
        (('a.part2.rar',), RarVersion.V3, 'Invalid index forces this to be interpreted as V3'),
        ((), RarVersion.V5, 'Empty input is only interpretable as a V5'),
    )
    for paths, version, label in accepted:
        # Only the detected version (first tuple element) is checked here.
        assert parse_rar_list(paths)[0] == version, label

    # (input, label raised if no error occurs, exact expected error text)
    rejected = (
        (('',), 'Bad format', '"" does not match the version-3 pattern'),
        (('a.rar', 'b.r00'), 'Disparate stems', 'b.r00 has an inconsistent stem'),
        (
            ('a.r00', 'a.r01'), 'Missing non-indexed suffix',
            '0 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.rar', 'a.rar'), 'Duplicate non-indexed suffixes',
            '2 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.part0.rar', 'a.part1.rar'), 'V5 indexed from wrong base value',
            'The following indices are unexpected: 0',
        ),
        (
            ('a.part1.rar', 'a.part1.rar'), 'V5 missing an index',
            'The following indices are missing: 2',
        ),
    )
    for paths, label, message in rejected:
        try:
            parse_rar_list(paths)
        except ValueError as error:
            assert str(error) == message
        else:
            raise AssertionError(label)


def test_sort() -> None:
    """Check that rar_sort puts shuffled V3 and V5 volume names into volume order."""
    v3_result = rar_sort(('a.r00', 'a.rar', 'a.r01'))
    assert v3_result == ['a.rar', 'a.r00', 'a.r01'], 'Simple v3 sort'

    v5_result = rar_sort(('a.part2.rar', 'a.part1.rar'))
    assert v5_result == ['a.part1.rar', 'a.part2.rar']


# Run both self-check suites when this file is executed as a script.
if __name__ == '__main__':
    test_parse()
    test_sort()
address OP points
Source Link
Reinderien
  • 71.1k
  • 5
  • 76
  • 256

I know that the code uses a lot of map constructs, but I find it easier to reason with them.

I don't, and to make a (perhaps bold) claim about the "typical programmer", I don't think that they would either. The functional-nesting map/lambda style is difficult to understand and maintain. I discourage this sort of code. It also loses the ability to generate meaningful messages describing the specific path that produces an error.

Also, the lambdas introduce a new scope which IMHO prevents subtle bugs in comparison to list comprehensions

I'm not sure what you mean by new scope in this context, but overall this seems incorrect. Lambdas have closure scope, so they have access to all of the symbols above them. The parameter is constrained to the scope of the lambda, but this is no different from variables of iteration in a comprehension, which are also discarded outside of the comprehension.

Using regexes might make this marginally easier, but probably not worth it

Well, whether it's easier or not, you're currently not validating enough of the path sequence consistency, and regexes will do a more thorough job; so I do consider them worth it. They will be able to capture and verify a consistent stem, suffix and suffix index. pathlib is helpful but not helpful enough.

There is one thing missing in the verification, that is checking for gaps in the list of files. But this is perhaps not that useful since a missing file may be exactly the last one

Those are two different scenarios. A missing last-file will indeed be impossible to detect without attempting a decompress, but a gap anywhere else will be visible.

import enum
import re
import typing
from pathlib import Path


class RarVersion(enum.IntEnum):
    """Naming scheme detected for a set of RAR volume paths."""

    # A single "<stem>.part1.rar" path: cannot be told apart from a one-volume V5 set.
    AMBIGUOUS = 0
    # "<stem>.rar" plus "<stem>.rNN" volumes.
    V3 = 3
    # "<stem>.part<N>.rar" volumes.
    V5 = 5


# Version-3 volume names: "<stem>.rar" for the unnumbered volume, or "<stem>.rNN"
# with a two-digit index. The 'index' group is None for the plain ".rar" form.
# NOTE(review): '$' also matches just before a trailing newline; confirm inputs never carry one.
V3_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.      # suffix separator dot
    (?P<suffix>
        rar |  # first literal 'rar' suffix
        r(?P<index>
            \d\d  # two-digit index
        )
    )
    $  # end
''')

# Version-5 volume names: "<stem>.part<N>.rar" where N is one or more decimal digits.
V5_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.part   # beginning of first suffix component
    (?P<index>
        \d+  # at least one digit
    )
    \.       # beginning of last suffix component
    (?P<suffix>
        rar  # last suffix component
    )
    $        # end
''')


def rar_list_verify(paths: typing.Sequence[str | Path]) -> RarVersion:
    if len(paths) == 0:
        # Since there is no non-indexed .rar, this must be interpreted as an "empty V5"
        return RarVersion.V5

    matches = [V5_PAT.match(str(p)) for p in paths]

    if any(m is None for m in matches):
        matches = [V3_PAT.match(str(p)) for p in paths]
        version = RarVersion.V3

        for path, match in zip(paths, matches):
            if match is None:
                raise ValueError(f'f'"{path}" does not match the version-3 pattern')
    elif len(paths) > 1:
        version = RarVersion.V5
    else:
        version = RarVersion.AMBIGUOUS

    stem = matches[0]['stem']
    for i, match in enumerate(matches[1:], start=1):
        if match['stem'] != stem:
            raise ValueError(f'{paths[i]} has an inconsistent stem')

    actual = {
        int(match['index'])
        for match in matches
        if match['index']
    }

    match version:
        case RarVersion.V3:
            base = 0
            count = len(paths) - 1
        case RarVersion.V5:
            base = 1
            count = len(paths)
        case RarVersion.AMBIGUOUS:
            # It's only possible for this to be a valid V5 if the only index is 1
            if actual == {1}:
                return version
            version = RarVersion.V3
            base = 0
            count = 0

    if version == RarVersion.V3:
        n_unnumbered = sum(
            1 for match in matches
            if match['suffix'] == 'rar'
        )
        if n_unnumbered != 1:
            raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one')

    if count > 0:
        expected = set(range(base, base + count))
        spurious = actual - expected
        if spurious:
            raise ValueError(
                'The following indices are unexpected: '
                + ', '.join(str(i) for i in spurious)
            )
        missing = expected - actual
        if missing:
            raise ValueError(
                'The following indices are missing: '
                + ', '.join(str(i) for i in missing)
            )

    return version


def test() -> None:
    """Exercise rar_list_verify with name sets it accepts and name sets it rejects."""
    # (paths, expected return value, message shown if the assertion fails)
    accepted = (
        (('a.part1.rar', 'a.part2.rar'), RarVersion.V5, 'Simple V5'),
        (('a.rar', 'a.r00', 'a.r01'), RarVersion.V3, 'Simple V3'),
        (('a.rar',), RarVersion.V3, 'Almost ambiguous but cannot be V5'),
        (
            ('a.part1.rar',), RarVersion.AMBIGUOUS,
            'Actually ambiguous even though it is likely V5',
        ),
        (
            ('a.part2.rar',), RarVersion.V3,
            'Invalid index forces this to be interpreted as V3',
        ),
        ((), RarVersion.V5, 'Empty input is only interpretable as a V5'),
    )
    for names, version, description in accepted:
        assert rar_list_verify(names) == version, description

    # (paths, label raised if no ValueError occurs, exact ValueError text expected)
    rejected = (
        (('',), 'Bad format', '"" does not match the version-3 pattern'),
        (('a.rar', 'b.r00'), 'Disparate stems', 'b.r00 has an inconsistent stem'),
        (
            ('a.r00', 'a.r01'), 'Missing non-indexed suffix',
            '0 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.rar', 'a.rar'), 'Duplicate non-indexed suffixes',
            '2 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.part0.rar', 'a.part1.rar'), 'V5 indexed from wrong base value',
            'The following indices are unexpected: 0',
        ),
        (
            ('a.part1.rar', 'a.part1.rar'), 'V5 missing an index',
            'The following indices are missing: 2',
        ),
    )
    for names, label, expected_message in rejected:
        try:
            rar_list_verify(names)
        except ValueError as e:
            assert str(e) == expected_message
        else:
            # The call was expected to fail but returned normally
            raise AssertionError(label)


# Run the self-test only when this file is executed as a script
if __name__ == '__main__':
    test()

The functional-nesting map/lambda style is difficult to understand and maintain. I discourage this sort of code. It also loses the ability to generate meaningful messages describing the specific path that produces an error.

I propose that you use regular expressions. Among other reasons, a regex is able to capture and verify a consistent stem, suffix and suffix index, which your code is not doing currently. pathlib is helpful, but not helpful enough for this.

import enum
import re
import typing
from pathlib import Path


class RarVersion(enum.IntEnum):
    """Volume naming scheme reported by rar_list_verify."""
    # Returned for a single path that matches V5_PAT with index 1
    AMBIGUOUS = 0
    # Names of the form matched by V3_PAT
    V3 = 3
    # Names of the form matched by V5_PAT
    V5 = 5


# Version-3 style volume names: "<stem>.rar" or "<stem>.rNN" (two-digit index).
# Named groups: stem, suffix, and index (None when the suffix is plain "rar").
V3_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.      # suffix separator dot
    (?P<suffix>
        rar |  # first literal 'rar' suffix
        r(?P<index>
            \d\d  # two-digit index
        )
    )
    $  # end
''')

# Version-5 style volume names: "<stem>.part<N>.rar".
# Named groups: stem, index (one or more digits) and suffix (always "rar").
V5_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.part   # beginning of first suffix component
    (?P<index>
        \d+  # at least one digit
    )
    \.       # beginning of last suffix component
    (?P<suffix>
        rar  # last suffix component
    )
    $        # end
''')


def rar_list_verify(paths: typing.Sequence[str | Path]) -> RarVersion:
    matches = [V5_PAT.match(str(p)) for p in paths]

    if any(m is None for m in matches):
        matches = [V3_PAT.match(str(p)) for p in paths]
        version = RarVersion.V3

        for path, match in zip(paths, matches):
            if match is None:
                raise ValueError(f'{path} does not match the version-3 pattern')
    elif len(paths) > 1:
        version = RarVersion.V5
    else:
        version = RarVersion.AMBIGUOUS

    stem = matches[0]['stem']
    for i, match in enumerate(matches[1:], start=1):
        if match['stem'] != stem:
            raise ValueError(f'{paths[i]} has an inconsistent stem')

    actual = {
        int(match['index'])
        for match in matches
        if match['index']
    }

    match version:
        case RarVersion.V3:
            base = 0
            count = len(paths) - 1
        case RarVersion.V5:
            base = 1
            count = len(paths)
        case RarVersion.AMBIGUOUS:
            # It's only possible for this to be a valid V5 if the only index is 1
            if actual == {1}:
                return version
            version = RarVersion.V3
            base = 0
            count = 0

    if version == RarVersion.V3:
        n_unnumbered = sum(
            1 for match in matches
            if match['suffix'] == 'rar'
        )
        if n_unnumbered != 1:
            raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one')

    if count > 0:
        expected = set(range(base, base + count))
        spurious = actual - expected
        if spurious:
            raise ValueError(
                'The following indices are unexpected: '
                + ', '.join(str(i) for i in spurious)
            )
        missing = expected - actual
        if missing:
            raise ValueError(
                'The following indices are missing: '
                + ', '.join(str(i) for i in missing)
            )

    return version


def test() -> None:
    """Exercise rar_list_verify with name sets it is expected to accept."""
    # (paths, expected return value, message shown if the assertion fails)
    accepted = (
        (('a.part1.rar', 'a.part2.rar'), RarVersion.V5, 'Simple V5'),
        (('a.rar', 'a.r00', 'a.r01'), RarVersion.V3, 'Simple V3'),
        (('a.rar',), RarVersion.V3, 'Almost ambiguous but cannot be V5'),
        (
            ('a.part1.rar',), RarVersion.AMBIGUOUS,
            'Actually ambiguous even though it is likely V5',
        ),
        (
            ('a.part2.rar',), RarVersion.V3,
            'Invalid index forces this to be interpreted as V3',
        ),
    )
    for names, version, description in accepted:
        assert rar_list_verify(names) == version, description


# Run the self-test only when this file is executed as a script
if __name__ == '__main__':
    test()

I know that the code uses a lot of map constructs, but I find it easier to reason with them.

I don't, and to make a (perhaps bold) claim about the "typical programmer", I don't think that they would either. The functional-nesting map/lambda style is difficult to understand and maintain. I discourage this sort of code. It also loses the ability to generate meaningful messages describing the specific path that produces an error.

Also, the lambdas introduce a new scope which IMHO prevents subtle bugs in comparison to list comprehensions

I'm not sure what you mean by new scope in this context, but overall this seems incorrect. Lambdas have closure scope, so they have access to all of the symbols above them. The parameter is constrained to the scope of the lambda, but this is no different from variables of iteration in a comprehension which are also discarded outside of the comprehension.

Using regexes might make this marginally easier, but probably not worth it

Well, whether it's easier or not, you're currently not validating enough of the path sequence consistency, and regexes will do a more thorough job; so I do consider them worth it. They will be able to capture and verify a consistent stem, suffix and suffix index. pathlib is helpful but not helpful enough.

There is one thing missing in the verification, that is checking for gaps in the list of files. But this is perhaps not that useful since a missing file may be exactly the last one

Those are two different scenarios. A missing last file will indeed be impossible to detect without attempting to decompress the archive, but a gap anywhere else will be visible.

import enum
import re
import typing
from pathlib import Path


class RarVersion(enum.IntEnum):
    """Volume naming scheme reported by rar_list_verify."""
    # Returned for a single path that matches V5_PAT with index 1
    AMBIGUOUS = 0
    # Names of the form matched by V3_PAT
    V3 = 3
    # Names of the form matched by V5_PAT (also returned for empty input)
    V5 = 5


# Version-3 style volume names: "<stem>.rar" or "<stem>.rNN" (two-digit index).
# Named groups: stem, suffix, and index (None when the suffix is plain "rar").
V3_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.      # suffix separator dot
    (?P<suffix>
        rar |  # first literal 'rar' suffix
        r(?P<index>
            \d\d  # two-digit index
        )
    )
    $  # end
''')

# Version-5 style volume names: "<stem>.part<N>.rar".
# Named groups: stem, index (one or more digits) and suffix (always "rar").
V5_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.part   # beginning of first suffix component
    (?P<index>
        \d+  # at least one digit
    )
    \.       # beginning of last suffix component
    (?P<suffix>
        rar  # last suffix component
    )
    $        # end
''')


def rar_list_verify(paths: typing.Sequence[str | Path]) -> RarVersion:
    if len(paths) == 0:
        # Since there is no non-indexed .rar, this must be interpreted as an "empty V5"
        return RarVersion.V5

    matches = [V5_PAT.match(str(p)) for p in paths]

    if any(m is None for m in matches):
        matches = [V3_PAT.match(str(p)) for p in paths]
        version = RarVersion.V3

        for path, match in zip(paths, matches):
            if match is None:
                raise ValueError(f'"{path}" does not match the version-3 pattern')
    elif len(paths) > 1:
        version = RarVersion.V5
    else:
        version = RarVersion.AMBIGUOUS

    stem = matches[0]['stem']
    for i, match in enumerate(matches[1:], start=1):
        if match['stem'] != stem:
            raise ValueError(f'{paths[i]} has an inconsistent stem')

    actual = {
        int(match['index'])
        for match in matches
        if match['index']
    }

    match version:
        case RarVersion.V3:
            base = 0
            count = len(paths) - 1
        case RarVersion.V5:
            base = 1
            count = len(paths)
        case RarVersion.AMBIGUOUS:
            # It's only possible for this to be a valid V5 if the only index is 1
            if actual == {1}:
                return version
            version = RarVersion.V3
            base = 0
            count = 0

    if version == RarVersion.V3:
        n_unnumbered = sum(
            1 for match in matches
            if match['suffix'] == 'rar'
        )
        if n_unnumbered != 1:
            raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one')

    if count > 0:
        expected = set(range(base, base + count))
        spurious = actual - expected
        if spurious:
            raise ValueError(
                'The following indices are unexpected: '
                + ', '.join(str(i) for i in spurious)
            )
        missing = expected - actual
        if missing:
            raise ValueError(
                'The following indices are missing: '
                + ', '.join(str(i) for i in missing)
            )

    return version


def test() -> None:
    """Exercise rar_list_verify with name sets it accepts and name sets it rejects."""
    # (paths, expected return value, message shown if the assertion fails)
    accepted = (
        (('a.part1.rar', 'a.part2.rar'), RarVersion.V5, 'Simple V5'),
        (('a.rar', 'a.r00', 'a.r01'), RarVersion.V3, 'Simple V3'),
        (('a.rar',), RarVersion.V3, 'Almost ambiguous but cannot be V5'),
        (
            ('a.part1.rar',), RarVersion.AMBIGUOUS,
            'Actually ambiguous even though it is likely V5',
        ),
        (
            ('a.part2.rar',), RarVersion.V3,
            'Invalid index forces this to be interpreted as V3',
        ),
        ((), RarVersion.V5, 'Empty input is only interpretable as a V5'),
    )
    for names, version, description in accepted:
        assert rar_list_verify(names) == version, description

    # (paths, label raised if no ValueError occurs, exact ValueError text expected)
    rejected = (
        (('',), 'Bad format', '"" does not match the version-3 pattern'),
        (('a.rar', 'b.r00'), 'Disparate stems', 'b.r00 has an inconsistent stem'),
        (
            ('a.r00', 'a.r01'), 'Missing non-indexed suffix',
            '0 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.rar', 'a.rar'), 'Duplicate non-indexed suffixes',
            '2 paths have a non-indexed suffix; must be exactly one',
        ),
        (
            ('a.part0.rar', 'a.part1.rar'), 'V5 indexed from wrong base value',
            'The following indices are unexpected: 0',
        ),
        (
            ('a.part1.rar', 'a.part1.rar'), 'V5 missing an index',
            'The following indices are missing: 2',
        ),
    )
    for names, label, expected_message in rejected:
        try:
            rar_list_verify(names)
        except ValueError as e:
            assert str(e) == expected_message
        else:
            # The call was expected to fail but returned normally
            raise AssertionError(label)


# Run the self-test only when this file is executed as a script
if __name__ == '__main__':
    test()
bugfix for ambiguous case
Source Link
Reinderien
  • 71.1k
  • 5
  • 76
  • 256
import enum
import re
import typing
from pathlib import Path


class RarVersion(enum.IntEnum):
    """Volume naming scheme reported by rar_list_verify."""
    # Returned for a single path that matches V5_PAT with index 1
    AMBIGUOUS = 0
    # Names of the form matched by V3_PAT
    V3 = 3
    # Names of the form matched by V5_PAT
    V5 = 5


# Version-3 style volume names: "<stem>.rar" or "<stem>.rNN" (two-digit index).
# Named groups: stem, suffix, and index (None when the suffix is plain "rar").
V3_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.      # suffix separator dot
    (?P<suffix>
        rar |  # first literal 'rar' suffix
        r(?P<index>
            \d\d  # two-digit index
        )
    )
    $  # end
''')

# Version-5 style volume names: "<stem>.part<N>.rar".
# Named groups: stem, index (one or more digits) and suffix (always "rar").
# Each group name is defined exactly once; a repeated name makes re.compile fail.
V5_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.part   # beginning of first suffix component
    (?P<index>
        \d+  # at least one digit
    )
    \.       # beginning of last suffix component
    (?P<suffix>
        rar  # last suffix component
    )
    $        # end
''')


def rar_list_verify(paths: typing.Sequence[str | Path]) -> RarVersion:
    matches = [V5_PAT.match(str(p)) for p in paths]

    if any(m is None for m in matches):
        matches = [V3_PAT.match(str(p)) for p in paths]
        version = RarVersion.V3

        for path, match in zip(paths, matches):
            if match is None:
                raise ValueError(f'{path} does not match the version-3 pattern')
    elif len(paths) > 1:
        version = RarVersion.V5
    else:
        version = RarVersion.AMBIGUOUS

    stem = matches[0]['stem']
    for i, match in enumerate(matches[1:], start=1):
        if match['stem'] != stem:
            raise ValueError(f'{paths[i]} has an inconsistent stem')

    actual = {
        int(match['index'])
        for match in matches
        if match['index']
    }

    match version:
        case RarVersion.V3:
            base = 0
            count = len(paths) - 1
        case RarVersion.V5:
            base = 1
            count = len(paths)
        case RarVersion.AMBIGUOUS:
            # It's only possible for this to be a valid V5 if the only index is 1
            if actual == {1}:
                return version
            version = RarVersion.V3
            base = 0
            count = 0

    if version == RarVersion.V3:
        n_unnumbered = sum(
            1 for match in matches
            if match['suffix'] == 'rar'
        )
        if n_unnumbered != 1:
            raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one') 

    if count > 0:
        expected = set(range(base, base + count))
        spurious = actual - expected
        if spurious:
            raise ValueError(
                'The following indices are unexpected: '
                + ', '.join(str(i) for i in spurious)
            )
        missing = expected - actual
        if missing:
            raise ValueError(
                'The following indices are missing: '
                + ', '.join(str(i) for i in missing)
            )

    return version


def test() -> None:
    """Exercise rar_list_verify with name sets it is expected to accept."""
    assert rar_list_verify(
        ('a.part1.rar', 'a.part2.rar')
    ) == RarVersion.V5, 'Simple V5'

    assert rar_list_verify(
        ('a.rar', 'a.r00', 'a.r01')
    ) == RarVersion.V3, 'Simple V3'

    assert rar_list_verify(
        ('a.rar',)
    ) == RarVersion.V3, 'Almost ambiguous but cannot be V5'

    assert rar_list_verify(
        ('a.part1.rar',)
    ) == RarVersion.AMBIGUOUS, 'Actually ambiguous even though it is likely V5'

    # rar_list_verify here falls back to V3 with no indices to check when a
    # single V5-style name has an index other than 1, so a plain equality
    # assertion is used rather than an expected-ValueError check.
    assert rar_list_verify(
        ('a.part2.rar',)
    ) == RarVersion.V3, 'Invalid index forces this to be interpreted as V3'


# Run the self-test only when this file is executed as a script
if __name__ == '__main__':
    test()
import enum
import re
import typing
from pathlib import Path


class RarVersion(enum.IntEnum):
    """Volume naming scheme reported by rar_list_verify."""
    # Returned for a single path that matches V5_PAT with index 1
    AMBIGUOUS = 0
    # Names of the form matched by V3_PAT
    V3 = 3
    # Names of the form matched by V5_PAT
    V5 = 5


# Version-3 style volume names: "<stem>.rar" or "<stem>.rNN" (two-digit index).
# Named groups: stem, suffix, and index (None when the suffix is plain "rar").
V3_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.      # suffix separator dot
    (?P<suffix>
        rar |  # first literal 'rar' suffix
        r(?P<index>
            \d\d  # two-digit index
        )
    )
    $  # end
''')

# Version-5 style volume names: "<stem>.part<N>.rar".
# Named groups: stem, suffix (the whole "part<N>.rar" tail, so never plain
# "rar") and, nested inside it, index (one or more digits).
V5_PAT = re.compile(
r'''(?x)
    ^    # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.
    (?P<suffix>
        part   # beginning of first suffix component
        (?P<index>
            \d+  # at least one digit
        )
        \.rar  # last suffix component
    )
    $      # end
''')


def rar_list_verify(paths: typing.Sequence[str | Path]) -> RarVersion:
    matches = [V5_PAT.match(str(p)) for p in paths]

    if any(m is None for m in matches):
        matches = [V3_PAT.match(str(p)) for p in paths]
        version = RarVersion.V3

        for path, match in zip(paths, matches):
            if match is None:
                raise ValueError(f'{path} does not match the version-3 pattern')
    elif len(paths) > 1:
        version = RarVersion.V5
    else:
        version = RarVersion.AMBIGUOUS

    stem = matches[0]['stem']
    for i, match in enumerate(matches[1:], start=1):
        if match['stem'] != stem:
            raise ValueError(f'{paths[i]} has an inconsistent stem')

    actual = {
        int(match['index'])
        for match in matches
        if match['index']
    }

    match version:
        case RarVersion.V3:
            base = 0
            count = len(paths) - 1
        case RarVersion.V5:
            base = 1
            count = len(paths)
        case RarVersion.AMBIGUOUS:
            # It's only possible for this to be a valid V5 if the only index is 1
            if actual == {1}:
                return version
            version = RarVersion.V3

    if version == RarVersion.V3:
        n_unnumbered = sum(
            1 for match in matches
            if match['suffix'] == 'rar'
        )
        if n_unnumbered != 1:
            raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one')

    expected = set(range(base, base + count))
    spurious = actual - expected
    if spurious:
        raise ValueError(
            'The following indices are unexpected: '
            + ', '.join(str(i) for i in spurious)
        )
    missing = expected - actual
    if missing:
        raise ValueError(
            'The following indices are missing: '
            + ', '.join(str(i) for i in missing)
        )

    return version


def test() -> None:
    """Exercise rar_list_verify with accepted name sets and one rejected set."""
    # (paths, expected return value, message shown if the assertion fails)
    accepted = (
        (('a.part1.rar', 'a.part2.rar'), RarVersion.V5, 'Simple V5'),
        (('a.rar', 'a.r00', 'a.r01'), RarVersion.V3, 'Simple V3'),
        (('a.rar',), RarVersion.V3, 'Almost ambiguous but cannot be V5'),
        (
            ('a.part1.rar',), RarVersion.AMBIGUOUS,
            'Actually ambiguous even though it is likely V5',
        ),
    )
    for names, version, description in accepted:
        assert rar_list_verify(names) == version, description

    try:
        rar_list_verify(('a.part2.rar',))
    except ValueError as e:
        assert 'paths have a non-indexed suffix' in str(e)
    else:
        # The call was expected to fail but returned normally
        raise AssertionError('Invalid index forces this to be interpreted as V3')


# Run the self-test only when this file is executed as a script
if __name__ == '__main__':
    test()
import enum
import re
import typing
from pathlib import Path


class RarVersion(enum.IntEnum):
    """Volume naming scheme reported by rar_list_verify."""
    # Returned for a single path that matches V5_PAT with index 1
    AMBIGUOUS = 0
    # Names of the form matched by V3_PAT
    V3 = 3
    # Names of the form matched by V5_PAT
    V5 = 5


# Version-3 style volume names: "<stem>.rar" or "<stem>.rNN" (two-digit index).
# Named groups: stem, suffix, and index (None when the suffix is plain "rar").
V3_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.      # suffix separator dot
    (?P<suffix>
        rar |  # first literal 'rar' suffix
        r(?P<index>
            \d\d  # two-digit index
        )
    )
    $  # end
''')

# Version-5 style volume names: "<stem>.part<N>.rar".
# Named groups: stem, index (one or more digits) and suffix (always "rar").
V5_PAT = re.compile(
r'''(?x)
    ^       # start
    (?P<stem>
        .+  # require a stem of at least one character
    )
    \.part   # beginning of first suffix component
    (?P<index>
        \d+  # at least one digit
    )
    \.       # beginning of last suffix component
    (?P<suffix>
        rar  # last suffix component
    )
    $        # end
''')


def rar_list_verify(paths: typing.Sequence[str | Path]) -> RarVersion:
    matches = [V5_PAT.match(str(p)) for p in paths]

    if any(m is None for m in matches):
        matches = [V3_PAT.match(str(p)) for p in paths]
        version = RarVersion.V3

        for path, match in zip(paths, matches):
            if match is None:
                raise ValueError(f'{path} does not match the version-3 pattern')
    elif len(paths) > 1:
        version = RarVersion.V5
    else:
        version = RarVersion.AMBIGUOUS

    stem = matches[0]['stem']
    for i, match in enumerate(matches[1:], start=1):
        if match['stem'] != stem:
            raise ValueError(f'{paths[i]} has an inconsistent stem')

    actual = {
        int(match['index'])
        for match in matches
        if match['index']
    }

    match version:
        case RarVersion.V3:
            base = 0
            count = len(paths) - 1
        case RarVersion.V5:
            base = 1
            count = len(paths)
        case RarVersion.AMBIGUOUS:
            # It's only possible for this to be a valid V5 if the only index is 1
            if actual == {1}:
                return version
            version = RarVersion.V3
            base = 0
            count = 0

    if version == RarVersion.V3:
        n_unnumbered = sum(
            1 for match in matches
            if match['suffix'] == 'rar'
        )
        if n_unnumbered != 1:
            raise ValueError(f'{n_unnumbered} paths have a non-indexed suffix; must be exactly one') 

    if count > 0:
        expected = set(range(base, base + count))
        spurious = actual - expected
        if spurious:
            raise ValueError(
                'The following indices are unexpected: '
                + ', '.join(str(i) for i in spurious)
            )
        missing = expected - actual
        if missing:
            raise ValueError(
                'The following indices are missing: '
                + ', '.join(str(i) for i in missing)
            )

    return version


def test() -> None:
    """Exercise rar_list_verify with name sets it is expected to accept."""
    # (paths, expected return value, message shown if the assertion fails)
    accepted = (
        (('a.part1.rar', 'a.part2.rar'), RarVersion.V5, 'Simple V5'),
        (('a.rar', 'a.r00', 'a.r01'), RarVersion.V3, 'Simple V3'),
        (('a.rar',), RarVersion.V3, 'Almost ambiguous but cannot be V5'),
        (
            ('a.part1.rar',), RarVersion.AMBIGUOUS,
            'Actually ambiguous even though it is likely V5',
        ),
        (
            ('a.part2.rar',), RarVersion.V3,
            'Invalid index forces this to be interpreted as V3',
        ),
    )
    for names, version, description in accepted:
        assert rar_list_verify(names) == version, description


# Run the self-test only when this file is executed as a script
if __name__ == '__main__':
    test()
stem alone is possible with pathlib
Source Link
Reinderien
  • 71.1k
  • 5
  • 76
  • 256
Loading
Source Link
Reinderien
  • 71.1k
  • 5
  • 76
  • 256
Loading