I am trying to recognize a sentence that I have read through optical character recognition software. This code will eventually run on a Raspberry Pi.
I know for certain that it's meant to be one of a few thousand sentences which I have written down in a file (sentencelist.txt), but the character recognition has messed up a few of the words.
For example:
The quick brown fox jumps over the lazy dog
has been read as
He quick broom fax jumps over the lazy dig
I want to compare the incorrect sentence to every sentence in my list, and then figure out which one it's meant to be.
I currently have a working program, but it is just too slow! I have about 10,000 entries in my sentence file, and the whole process takes over a minute.
I initially used the Levenshtein algorithm to compare, but am now using a different algorithm which compares words rather than characters to speed it up.
How can I speed this baby up?
// C source for sentence lookup from OCR string. In main, you can set the distance calculator to be used.
#include "stdafx.h"
#include <iterator>
#include <string>
#include <vector>
#include <list>
#include <iostream>
#include <fstream>
#include <sstream>
#include <algorithm>
using namespace std;
// Levenshtein Distance Function - callable from Main
size_t LevenshteinDistance(const std::string &s1, const std::string &s2)
{
const size_t m(s1.size());
const size_t n(s2.size());
if (m == 0) return n;
if (n == 0) return m;
size_t *costs = new size_t[n + 1];
for (size_t k = 0; k <= n; k++) costs[k] = k;
size_t i = 0;
for (std::string::const_iterator it1 = s1.begin(); it1 != s1.end(); ++it1, ++i)
{
costs[0] = i + 1;
size_t corner = i;
size_t j = 0;
for (std::string::const_iterator it2 = s2.begin(); it2 != s2.end(); ++it2, ++j)
{
size_t upper = costs[j + 1];
if (*it1 == *it2)
{
costs[j + 1] = corner;
}
else
{
size_t t(upper<corner ? upper : corner);
costs[j + 1] = (costs[j]<t ? costs[j] : t) + 1;
}
corner = upper;
}
}
size_t result = costs[n];
delete[] costs;
return result;
}
// edit distance function (Levenshtein Distance for words) - callable from Main
typedef std::vector<std::string> Sentence;
Sentence &split(const std::string &s, char delim, Sentence &elems) {
std::stringstream ss(s);
std::string item;
while (std::getline(ss, item, delim)) {
elems.push_back(item);
}
return elems;
}
Sentence split(const std::string &s, char delim) {
Sentence elems;
split(s, delim, elems);
return elems;
}
unsigned int edit_distance(const Sentence& s1, const Sentence& s2)
{
const std::size_t len1 = s1.size(), len2 = s2.size();
std::vector<std::vector<unsigned int>> d(len1 + 1, std::vector<unsigned int>(len2 + 1));
d[0][0] = 0;
for (unsigned int i = 1; i <= len1; ++i) d[i][0] = i;
for (unsigned int i = 1; i <= len2; ++i) d[0][i] = i;
for (unsigned int i = 1; i <= len1; ++i)
for (unsigned int j = 1; j <= len2; ++j)
{
d[i][j] = std::min(d[i - 1][j] + 1, d[i][j - 1] + 1);
d[i][j] = std::min(d[i][j], d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1));
}
return d[len1][len2];
}
int main()
{
// Tesseract outputs sentence data (Q). We call this string s1.
string s1 = "He quick broom fax jumps over the lazy dig";
// String s2 is out comparison string, read from sentence database
string s2;
// Split strings into vectors of sentences
Sentence sen1 = split(s1, ' ');
Sentence sen2;
int i = 0; // Counter
int num = 13309; // Number of entries in sentence file
string sentencevec[13309]; // Vector where each entry is sentence string from file
int distancevec[13309]; // Vector where each entry is distance between s1 and corresponding sentence
//Read from sentence file and populate aforementioned vectors with data
string line;
ifstream myfile("sentencelistt.txt");
if (myfile.is_open())
{
while (getline(myfile, line))
{
string s2 = line;
sen2 = split(s2, ' ');
i++;
distancevec[i - 1] = edit_distance(sen1, sen2); // Here is where you set the distance calculator. Currently set to LevenshteinDistance, but can set to edit
sentencevec[i - 1] = s2;
}
myfile.close();
}
else cout << "Unable to open file";
// Find smallest entry in distancevec
int smallestsentence = 0; // Column of sentencevec corresponding to matched sentence
int smallestdistance = distancevec[0]; // Distance to matched sentence (Column of distancevec corresponding to matched sentence)
for (i = 0; i < num; i++) {
if (distancevec[i] < smallestdistance) {
smallestdistance = distancevec[i];
smallestsentence = i;
}
}
// Print recognized sentence and pause so program doesn't immediately exit
cout << '\n' << '\n' << sentencevec[smallestsentence] << '\n' << '\n';
system("pause");
return 0;
}