Return to Answer

edited body

Source Link

edited Nov 2, 2023 at 2:40

7.9k
1
18
28

def get_metaphone_to_word(in_file="product_catalog.txt"):
    """Build a lookup table from metaphone codes to catalog words.

    Each whitespace-separated, lower-cased word of ``in_file`` is passed to
    ``metaphone()``, and both codes it returns are mapped to that word.

    Args:
        in_file: Path of the text file to read.

    Returns:
        A dict mapping each code to the last word in the file that produced
        it; on a collision, later words overwrite earlier ones.
    """
    m_to_word = {}
    # Open the path given by the ``in_file`` parameter (the previous spelling,
    # ``infile``, was an undefined name and raised NameError).
    with open(in_file) as fin:
        for line in fin:
            for word in line.lower().split():
                m1, m2 = metaphone(word)
                m_to_word[m1] = word
                m_to_word[m2] = word
    return m_to_word

# Build the code-to-word table once, at module level, from the default
# catalog path; fix_product_typos() reads this dict.
m_to_word = get_metaphone_to_word()

def fix_product_typos(line):
    """Replace words of ``line`` with catalog words sharing a metaphone code.

    Args:
        line: Text to clean up; it is lower-cased and split on whitespace.

    Returns:
        The processed words joined by single spaces. A word with no match in
        ``m_to_word`` is kept (lower-cased) as is.
    """
    out_words = []
    for word in line.lower().split():
        m1, m2 = metaphone(word)
        w1 = m_to_word.get(m1)
        w2 = m_to_word.get(m2)
        # Prefer the match found via the second code, then the first.
        if w2:
            out_words.append(w2)
        elif w1:
            out_words.append(w1)
        else:
            # We were unable to clean up a possibly misspelled word.
            # It might be an English word or surname not in our catalog.
            out_words.append(word)

    return " ".join(out_words)

def get_metaphone_to_word(in_file="product_catalog.txt"):
    """Build a lookup table from metaphone codes to catalog words.

    Each whitespace-separated, lower-cased word of ``in_file`` is passed to
    ``metaphone()``, and both codes it returns are mapped to that word.

    Args:
        in_file: Path of the text file to read.

    Returns:
        A dict mapping each code to the last word in the file that produced
        it; on a collision, later words overwrite earlier ones.
    """
    m_to_word = {}
    # Open the path given by the ``in_file`` parameter (the previous spelling,
    # ``infile``, was an undefined name and raised NameError).
    with open(in_file) as fin:
        for line in fin:
            for word in line.lower().split():
                m1, m2 = metaphone(word)
                m_to_word[m1] = word
                m_to_word[m2] = word
    return m_to_word

# Build the code-to-word table once, at module level, from the default
# catalog path; fix_product_typos() reads this dict.
m_to_word = get_metaphone_to_word()

def fix_product_typos(line):
    """Coerce the words of ``line`` toward the vocabulary in ``m_to_word``.

    The line is lower-cased and split on whitespace. Each token is replaced
    by the catalog word stored under its first metaphone code, else the one
    stored under its second code, else the token itself is kept.

    Returns the resulting tokens joined by single spaces.
    """

    def _canonical(token):
        primary, secondary = metaphone(token)
        by_primary = m_to_word.get(primary)
        by_secondary = m_to_word.get(secondary)
        # Falling through to ``token`` means no catalog entry matched: it may
        # be an English word or surname that is not in our catalog.
        return by_primary or by_secondary or token

    return " ".join([_canonical(token) for token in line.lower().split()])

def get_metaphone_to_word(in_file="product_catalog.txt"):
    """Build a lookup table from metaphone codes to catalog words.

    Each whitespace-separated, lower-cased word of ``in_file`` is passed to
    ``metaphone()``, and both codes it returns are mapped to that word.

    Args:
        in_file: Path of the text file to read.

    Returns:
        A dict mapping each code to the last word in the file that produced
        it; on a collision, later words overwrite earlier ones.
    """
    m_to_word = {}
    # Open the path given by the ``in_file`` parameter (the previous spelling,
    # ``infile``, was an undefined name and raised NameError).
    with open(in_file) as fin:
        for line in fin:
            for word in line.lower().split():
                m1, m2 = metaphone(word)
                m_to_word[m1] = word
                m_to_word[m2] = word
    return m_to_word

# Build the code-to-word table once, at module level, from the default
# catalog path; fix_product_typos() reads this dict.
m_to_word = get_metaphone_to_word()

def fix_product_typos(line):
    """Coerce the words of ``line`` toward the vocabulary in ``m_to_word``.

    The line is lower-cased and split on whitespace. Each token is replaced
    by the catalog word stored under its second metaphone code, else the one
    stored under its first code, else the token itself is kept.

    Returns the resulting tokens joined by single spaces.
    """

    def _canonical(token):
        primary, secondary = metaphone(token)
        by_primary = m_to_word.get(primary)
        by_secondary = m_to_word.get(secondary)
        # Falling through to ``token`` means no catalog entry matched: it may
        # be an English word or surname that is not in our catalog.
        return by_secondary or by_primary or token

    return " ".join([_canonical(token) for token in line.lower().split()])

Source Link

answered Nov 2, 2023 at 2:29

J_H

7.9k
1
18
28

You didn't supply a use case, so I will make one up for you.

def get_metaphone_to_word(in_file="product_catalog.txt"):
    """Build a lookup table from metaphone codes to catalog words.

    Each whitespace-separated, lower-cased word of ``in_file`` is passed to
    ``metaphone()``, and both codes it returns are mapped to that word.

    Args:
        in_file: Path of the text file to read.

    Returns:
        A dict mapping each code to the last word in the file that produced
        it; on a collision, later words overwrite earlier ones.
    """
    m_to_word = {}
    # Open the path given by the ``in_file`` parameter (the previous spelling,
    # ``infile``, was an undefined name and raised NameError).
    with open(in_file) as fin:
        for line in fin:
            for word in line.lower().split():
                m1, m2 = metaphone(word)
                m_to_word[m1] = word
                m_to_word[m2] = word
    return m_to_word

# Build the code-to-word table once, at module level, from the default
# catalog path; fix_product_typos() reads this dict.
m_to_word = get_metaphone_to_word()

def fix_product_typos(line):
    """Coerce the words of ``line`` toward the vocabulary in ``m_to_word``.

    The line is lower-cased and split on whitespace. Each token is replaced
    by the catalog word stored under its first metaphone code, else the one
    stored under its second code, else the token itself is kept.

    Returns the resulting tokens joined by single spaces.
    """

    def _canonical(token):
        primary, secondary = metaphone(token)
        by_primary = m_to_word.get(primary)
        by_secondary = m_to_word.get(secondary)
        # Falling through to ``token`` means no catalog entry matched: it may
        # be an English word or surname that is not in our catalog.
        return by_primary or by_secondary or token

    return " ".join([_canonical(token) for token in line.lower().split()])

Apply fix_product_typos() to each line of Slack, email, or other customer input, to coerce the input terms toward a restricted vocabulary, such as widget names found in a product catalog.

Alternatively, one might feed it /usr/share/dict/words in hopes of correcting English language typos.

There's a "last update wins!" aspect to hash collisions, so it will work better with an input file sorted so that rare terms appear first and popular terms appear near the end.

result = doublemetaphone('cambrillo')
self.assertEquals(result, ('KMPRL', 'KMPR'))

nit: Better to phrase each AAA (arrange, act, assert) sequence as a one-liner for these simple tests.

self.assertEquals(doublemetaphone('cambrillo'), ('KMPRL', 'KMPR'))

Nothing wrong with repeated copy-n-paste for a simple test suite like this. But you might also consider iterating through a list:

    # Table-driven variant: each entry pairs an input word with the tuple of
    # codes that doublemetaphone() is expected to return for it.
    for word, expected_metaphones in [
        # ...
        ('cambrillo', ('KMPRL', 'KMPR')),
        # ...
    ]:
        self.assertEquals(doublemetaphone(word), expected_metaphones)