Add a comment

Source Link

edited Feb 28, 2019 at 18:04

88.3k
14
104
327

import itertools
import random
import string
import time
import timeit

def make_random_words(min_len, max_len):
    """An infinite generator of words containing min_len to max_len letters"""
    while True:
        length = random.randint(min_len, max_len)
        yield ''.join([random.choice(string.ascii_letters) for n in range(length)])

if __name__ == "__main__":
    # try ten-thousand random words and a long text
    words = list(itertools.islice(make_random_words(5, 15), 10000))
    mobydick = open("moby-dick.txt").read()

    print(timeit.timeit("Main.get_longest_word(mobydick, words)",
                        number=10, timer=time.process_time, globals=globals()))

    print(timeit.timeit("Matcher(words).read(mobydick)",
                        number=10, timer=time.process_time, globals=globals()))

import itertools
import random
import string
import time
import timeit

def make_random_words(min_len, max_len):
    """An infinite generator of words containing min_len to max_len letters"""
    while True:
        length = random.randint(min_len, max_len)
        yield ''.join([random.choice(string.ascii_letters) for n in range(length)])

if __name__ == "__main__":
    words = list(itertools.islice(make_random_words(5, 15), 10000))
    mobydick = open("moby-dick.txt").read()

    print(timeit.timeit("Main.get_longest_word(mobydick, words)",
                        number=10, timer=time.process_time, globals=globals()))

    print(timeit.timeit("Matcher(words).read(mobydick)",
                        number=10, timer=time.process_time, globals=globals()))

import itertools
import random
import string
import time
import timeit

def make_random_words(min_len, max_len):
    """An infinite generator of words containing min_len to max_len letters"""
    while True:
        length = random.randint(min_len, max_len)
        yield ''.join([random.choice(string.ascii_letters) for n in range(length)])

if __name__ == "__main__":
    # try ten-thousand random words and a long text
    words = list(itertools.islice(make_random_words(5, 15), 10000))
    mobydick = open("moby-dick.txt").read()

    print(timeit.timeit("Main.get_longest_word(mobydick, words)",
                        number=10, timer=time.process_time, globals=globals()))

    print(timeit.timeit("Matcher(words).read(mobydick)",
                        number=10, timer=time.process_time, globals=globals()))

added 133 characters in body

Source Link

edited Feb 28, 2019 at 12:16

Toby Speight

88.3k
14
104
327

It should be obvious that this really lends itself to a prefix-tree representation, but we can make it work for plain lists, too.

Here's a quickly knocked up implementation of that idea, with some small improvements (e.g. finish early if we match the longest candidate). Please forgive a C++ coder's approach to Python:

Source Link

answered Feb 28, 2019 at 9:54

Toby Speight

88.3k
14
104
327

Clarify requirements

What result is desired if there's a tie for longest word? Accepting the first of equal-length words is a valid choice, but we'll want to document that we've made a choice where the spec was vague. Similarly, returning None when there are no matches is another valid choice that should be documented.

I'd lean toward returning a collection of results; that allows the caller to decide whether to print one or more, and streamlines the handling of the empty result.

Reduce computational complexity

The code presented tests every candidate against the entire input, in turn. This scales as O(M✕N), where M is the size of the text and N is the number of candidate words. We can reduce that complexity by grouping the words according to the next unseen letter in the word; then walk through the text, examining just the set of words that can be advanced at each step. That scales as O(M+Nn), where Nn is the total length of all the candidate words.

Here's a quickly knocked up implementation of that idea, with some small improvements (e.g. finish early if we match the longest candidate).

from collections import defaultdict

class Word:
    def __init__(self, s):
        self.chars = s
        self.index = 0

    def __repr__(self):
        return self.chars + "(" + str(self.index) + ")"

class Matcher:
    def __init__(self, words):
        self.candidates = defaultdict(list)
        self.longest = 0
        self.results = []
        for w in words:
            self.insert(Word(w))

    def insert(self, word):
        length = len(word.chars)
        if length < self.longest:
            return
        elif word.index < length:
            self.candidates[word.chars[word.index]].append(word)
        elif length == self.longest:
            self.results.append(word.chars)
        elif length > self.longest:
            self.longest = length
            self.results = [word.chars]
            for i in self.candidates.values():
                i[:] = filter(lambda x: len(x.chars) >= length, i)

    def step(self, char):
        words = self.candidates[char]
        del self.candidates[char]
        for w in words:
            w.index += 1
            self.insert(w)
        if not any(self.candidates.values()):
            raise StopIteration

    def read(self, it):
        try:
            for c in it:
                self.step(c)
        except StopIteration as e:
            pass
        return self.results

Testing this on a large problem set gives results around 20✕ faster (after removing the print() from your code, to give a fair comparison):

import itertools
import random
import string
import time
import timeit

def make_random_words(min_len, max_len):
    """An infinite generator of words containing min_len to max_len letters"""
    while True:
        length = random.randint(min_len, max_len)
        yield ''.join([random.choice(string.ascii_letters) for n in range(length)])

if __name__ == "__main__":
    words = list(itertools.islice(make_random_words(5, 15), 10000))
    mobydick = open("moby-dick.txt").read()

    print(timeit.timeit("Main.get_longest_word(mobydick, words)",
                        number=10, timer=time.process_time, globals=globals()))

    print(timeit.timeit("Matcher(words).read(mobydick)",
                        number=10, timer=time.process_time, globals=globals()))

9.290360821999998
0.510936933

These results might be more dramatic if we read directly from the file, as stopping early will have greater benefit in that case.

Stack Exchange Network

Return to Answer

Clarify requirements

Reduce computational complexity