Friday, August 17, 2007

Implementing Peter Norvig's spelling corrector in C++

  Recently I came across Peter Norvig's latest essay on implementing a Bayesian spelling
corrector
in Python. Inspired, I re-implemented something similar in C++, using it as a
chance to revise my very rusty
C++ skills (see below). It's absolutely nowhere near the
elegance of Peter's python code but it seems to
work which is all I can ask for.
Compiles with g++ and boost libraries (including the regex one) with -lboost_regex switch.



#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <set>
#include <cctype>
#include <boost/tokenizer.hpp>
#include <boost/regex.hpp>

using namespace
std;
using namespace
boost;

map<string,int> model;
void
toLower(basic_string<char> &s) {
for
(basic_string<char>::iterator p = s.begin();
p != s.end(); ++p) {
*
p = tolower(*p);
}
}

map<string,int> train_model(string filename) {

ifstream in(filename.c_str());

string tmp;
regex re("[a-zA-Z]+");
map<string,int> model;
while
(!in.eof()) {
getline(in, tmp, '\n');
tokenizer<> tok(tmp);
for
(tokenizer<>::iterator beg=tok.begin(); beg!=tok.end();++beg){
if
(regex_match(*beg, re)) {
string t(*beg);
toLower(t);
model[t]++;
}
}
}

return
model;
}

set<string> edit1(string w) {
set<string> edit1set;
int
word_len = w.length();
string alphabet("abcdefghijklmnopqrstuvwxyz");

//deletion
for(int i = 0; i < word_len; ++i)
edit1set.insert(w.substr(0,i) + w.substr(i+1,word_len));

//transposition
for(int i = 0; i < word_len-1; ++i)
edit1set.insert(w.substr(0,i) + w[i+1] + w[i] + w.substr(i+2,word_len));

// alteration
for(int i = 0; i < word_len; ++i) {
for
(string::iterator si=alphabet.begin(); si != alphabet.end(); ++si) {
edit1set.insert(w.substr(0,i) + *si + w.substr(i+1,word_len));
}
}


//insertion
for(int i = 0; i < word_len+1; ++i) {
for
(string::iterator si=alphabet.begin(); si != alphabet.end(); ++si) {
edit1set.insert(w.substr(0,i) + *si + w.substr(i,word_len)); }
}

return
edit1set;
}


set<string> known_edit2(string w) {
set<string> ed2;

set<string> ed1 = edit1(w);
for
(set<string>::iterator si = ed1.begin(); si != ed1.end(); ++si) {
set<string> tmp = edit1(*si);

for
(set<string>::iterator si2 = tmp.begin(); si2 !=tmp.end(); ++si2) {
if
(model.find(*si2) != model.end()) ed2.insert(*si2);
}
}

return
ed2;
}


set<string> known(set<string> words) {
set<string> known_set;
for
(set<string>::const_iterator si = words.begin(); si != words.end(); ++si) {
if
(model.find(*si) != model.end()) known_set.insert(*si);
}

return
known_set;
}


string correct(string word) {

set<string> candidates;
set<string> w;
w.insert(word);

if
((candidates = known(w)).size() == 0) {
if
((candidates = known(edit1(word))).size() == 0){
if
((candidates = known(known_edit2(word))).size() == 0) {
return
word;
}
}
}


int
cur_max = 0;
string tmp;
for
(set<string>::const_iterator si = candidates.begin(); si !=candidates.end(); ++si) {
if
(model[*si] > cur_max) {
cur_max = model[*si];
tmp = *si;
}
}

return
tmp;
}


int
main(int argc, char** argv) {
model = train_model("big.txt");

string input;
while
(1) {
cout << "Testing correct(), enter a word: ";
cin >> input;
cout << "correct(" << input << ")" << " returns " << correct(input) << '\n';
}
}

0 Comments:

Post a Comment

Subscribe to Post Comments [Atom]

<< Home