Saturday, August 18, 2007

Converting C++ code to HTML

 The C++ code (yes, it's primitive, I know) I presented in my last
entry was not formatted attractively, so I searched the web for
a C++-to-HTML formatter (written in C++). I soon found one among
the examples in the Boost.Regex library documentation. After some
modifications, I got it to format the previous C++ code. It looks
nicer now (though it's still novice-level). Here's the output:




#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <set>
#include <cctype>

#include <boost/tokenizer.hpp>
#include <boost/regex.hpp>

using namespace std;
using namespace boost;

// Word -> occurrence-count table shared by the whole program: main() assigns
// it from train_model(), and known(), known_edit2() and correct() look words
// up in it.
map<string,int> model;


// Lower-cases every character of `s` in place, using the C library's
// tolower() (i.e. according to the current C locale).
void toLower(basic_string<char> &s) {
    for (basic_string<char>::iterator p = s.begin(); p != s.end(); ++p) {
        // tolower() takes an int that must be representable as unsigned char
        // (or be EOF). Passing a plain char that happens to be negative -- any
        // byte >= 0x80 where char is signed -- is undefined behaviour, so go
        // through unsigned char first.
        *p = static_cast<char>(tolower(static_cast<unsigned char>(*p)));
    }
}

map<string,int> train_model(string filename) {

ifstream in(filename.c_str());

string tmp;
regex re("[a-zA-Z]+");
map<string,int> model;

while(!in.eof()) {
getline(in, tmp, '\n');
tokenizer<> tok(tmp);
for(tokenizer<>::iterator beg=tok.begin(); beg!=tok.end();++beg){
if (regex_match(*beg, re)) {
string t(*beg);
toLower(t);
model[t]++;
}
}
}
return model;
}

// Returns every distinct string that is exactly one simple edit away from
// `w`, where an edit is one of:
//   - deleting one character,
//   - swapping two adjacent characters,
//   - replacing one character with a letter a-z,
//   - inserting a letter a-z at any position (including both ends).
// `w` itself is part of the result whenever it is non-empty, because
// replacing a character with the same letter reproduces it.
set<string> edit1(string w) {
    const string letters("abcdefghijklmnopqrstuvwxyz");
    const string::size_type len = w.length();
    set<string> variants;

    // Walk over every split point of the word: `head` is what precedes the
    // split, `tail` is what follows it. All four edit kinds are expressed
    // relative to that split.
    for (string::size_type cut = 0; cut <= len; ++cut) {
        const string head = w.substr(0, cut);
        const string tail = w.substr(cut);

        if (!tail.empty()) {
            const string rest = tail.substr(1);

            // deletion: drop the first character of the tail
            variants.insert(head + rest);

            // transposition: swap the first two characters of the tail
            if (tail.size() > 1) {
                variants.insert(head + tail[1] + tail[0] + tail.substr(2));
            }

            // alteration: replace the first character of the tail
            for (string::size_type k = 0; k < letters.size(); ++k) {
                variants.insert(head + letters[k] + rest);
            }
        }

        // insertion: put a letter between head and tail
        for (string::size_type k = 0; k < letters.size(); ++k) {
            variants.insert(head + letters[k] + tail);
        }
    }
    return variants;
}

set<string> known_edit2(string w) {
set<string> ed2;

set<string> ed1 = edit1(w);
for(set<string>::iterator si = ed1.begin(); si != ed1.end(); ++si) {
set<string> tmp = edit1(*si);

for(set<string>::iterator si2 = tmp.begin(); si2 !=tmp.end(); ++si2) {
if(model.find(*si2) != model.end()) ed2.insert(*si2);
}
}
return ed2;
}

// Returns the subset of `words` whose members appear as keys in the global
// `model`.
set<string> known(set<string> words) {
    set<string> recognised;

    set<string>::const_iterator it = words.begin();
    while (it != words.end()) {
        if (model.count(*it) != 0) {
            recognised.insert(*it);
        }
        ++it;
    }
    return recognised;
}

// Suggests a spelling for `word`.
//
// Candidate sets are tried in order, and the first non-empty one wins:
//   1. `word` itself, if the model knows it;
//   2. known strings one edit away (edit1);
//   3. known strings two edits away (known_edit2).
// If all three are empty, `word` is returned unchanged. Otherwise the
// candidate with the highest count in `model` is returned; on a tie the
// candidate that comes first in the set's ordering is kept.
string correct(string word) {
    set<string> exact;
    exact.insert(word);

    set<string> candidates = known(exact);
    if (candidates.empty()) {
        candidates = known(edit1(word));
    }
    if (candidates.empty()) {
        candidates = known(known_edit2(word));
    }
    if (candidates.empty()) {
        return word;
    }

    string best;
    int best_count = 0;
    for (set<string>::const_iterator it = candidates.begin();
         it != candidates.end(); ++it) {
        const int count = model[*it];
        if (count > best_count) {
            best_count = count;
            best = *it;
        }
    }
    return best;
}

// Interactive driver: loads the word model from "big.txt", then repeatedly
// prompts for a word on standard input and prints what correct() returns.
// The loop ends when standard input is exhausted or fails.
//
// Returns 0 on normal termination, 1 if no words could be loaded.
int main(int argc, char** argv) {

    model = train_model("big.txt");
    if (model.empty()) {
        // With an empty model correct() can only echo its input back, so
        // report the problem instead of pretending to work.
        cerr << "No words could be loaded from big.txt\n";
        return 1;
    }

    string input;
    while (true) {
        cout << "Testing correct(), enter a word: ";
        // Stop on EOF or a stream error; without this check a closed stdin
        // makes the loop spin forever, re-printing the prompt and the
        // previous word.
        if (!(cin >> input)) {
            break;
        }
        cout << "correct(" << input << ")" << " returns " << correct(input) << '\n';
    }
    return 0;
}

Labels:

1 Comments:

Anonymous Anonymous said...

Good for people to know.

November 10, 2008 at 11:59 PM  

Post a Comment

Subscribe to Post Comments [Atom]

<< Home