proj home

Files   Classes   Functions   Hierarchy  

tokenizer.cpp

Go to the documentation of this file.
00001 #include <tokenizer.h>
00002 
00003 #include <fstream>
00004 #include <sstream>
00005 using namespace std;
00006 
00007 
00008 boolc tokenizer::operator ! () const 
00009 { 
00010   return (current != seq.end()); 
00011 }
00012 
00013 string & tokenizer::operator * ()
00014 { 
00015   assert(current != seq.end()); 
00016   return *current; 
00017 } 
00018 
00019 stringc & tokenizer::operator() () const 
00020 { 
00021   assert(current != seq.end()); 
00022   return *current; 
00023 }
00024 
00025 void tokenizer::remove(stringc & token)
00026 { 
00027   seq.remove(token); 
00028 }
00029 
00030 void tokenizer::remove_if()
00031 { 
00032   seq.remove_if(spacerdelete<>()); 
00033 }
00034 
00035 void tokenizer::trim()
00036 { 
00037   apply(spacertrim<>()); 
00038 }
00039 
00040 void tokenizer::trim_and_prune()
00041 { 
00042   trim(); 
00043   remove_if(spacerdelete<>()); 
00044 }
00045 
00046 void tokenizer::read(stringc & data)
00047 { 
00048   seq.push_back(data); 
00049 }
00050 
00051 void tokenizer::readaslines(stringc & data)
00052 { 
00053   seq.push_back(data); subtract("\n"); 
00054 }
00055 
00056 
00057 
00058 boolc tokenizermisc::comparewithoutspace
00059 (
00060   stringc & s1, 
00061   stringc & s2
00062 )
00063 {
00064   tokenizer t1(s1);
00065   t1.atomize(" ");
00066   t1.atomize("\n");
00067   t1.atomize("\t");
00068   t1.trim();
00069   t1.remove_if(spacerdelete<>());
00070 
00071   tokenizer t2(s2);
00072   t2.atomize(" ");
00073   t2.atomize("\n");
00074   t2.atomize("\t");
00075   t2.trim();
00076   t2.remove_if(spacerdelete<>());
00077 
00078   return t1==t2;
00079 }
00080 
00081 boolc tokenizermisc::readfile
00082 (
00083   string & str, 
00084   stringc & fname
00085 )
00086 {
00087   assert(false); // Use stringserialization::serialize(str,filename); 
00088   ifstream file(fname.c_str());
00089   if (!file)
00090     return false;
00091 
00092   stringstream ss;
00093   ss << file;
00094 
00095   str = ss.str();
00096 
00097 
00098 //  char ch;
00099 
00100 /*
00101   if (file.get(ch))
00102   {
00103     str.push_back(ch);
00104     for ( ; file.get(ch); )
00105     {
00106       str.push_back(ch) 
00107     };
00108   }
00109 */
00110 
00111 
00112   //while (file.get(ch)!= EOF)
00113 /*
00114   while ( file.get(ch) )
00115   { 
00116     str.push_back(ch) 
00117   };
00118 */
00119 
00120   return true;
00121 }
00122 
00123 void tokenizer::stripcomment(stringc & comment)
00124 {
00125   liststringi k = seq.begin();
00126   string::size_type i;
00127   for (;k!=seq.end(); ++k)
00128   {
00129     string & token(*k);
00130     i=0;
00131     i = token.find(comment,i);
00132     if (i==string::npos)
00133       continue;
00134 
00135     token.erase(i);
00136   }
00137 }
00138 
00139 
00140 
00141 boolc tokenizer::atomize_next
00142 ( 
00143   stringc & atom 
00144 )
00145 {
00146   liststringi i = current;
00147 
00148   string::size_type k;
00149   
00150   string::size_type const atomlen = atom.length();
00151 
00152   for (;i!=seq.end(); ++i)
00153   {
00154     k = i->find(atom.c_str());
00155 
00156     if (k==string::npos)
00157       continue;
00158 
00159     atomize(i,atom,k);
00160     if (i->length()==atomlen)
00161     {
00162       if (atom==*i)
00163       {
00164         current=i;
00165         return true;
00166       }
00167     }
00168 
00169     ++i;
00170     atomize(i,atom,0);
00171     assert(i->length()==atomlen);
00172     current=i;
00173     return true;
00174   }
00175 
00176   return false; 
00177 }
00178 
00179 
00180 
00181 
00182 
00183 
00184 
00185 boolc tokenizer::atomize_next
00186 ( 
00187   stringc & atom, 
00188   liststringi & iend_
00189 )
00190 {
00191   liststringi i = current;
00192 
00193   string::size_type k;
00194   
00195   string::size_type const atomlen = atom.length();
00196 
00197   for (;i!=iend_; ++i)
00198   {
00199     k = i->find(atom.c_str());
00200 
00201     if (k==string::npos)
00202       continue;
00203 
00204     atomize(i,atom,k);
00205     if (i->length()==atomlen)
00206     {
00207       if (atom==*i)
00208       {
00209         current=i;
00210         return true;
00211       }
00212     }
00213 
00214     ++i;
00215     atomize(i,atom,0);
00216     assert(i->length()==atomlen);
00217     current=i;
00218     return true;
00219   }
00220 
00221   return false; 
00222 }
00223 
00224 
00225 
00226 
00227 
00228 boolc tokenizer::atomize_next
00229 (
00230   liststringi& i1,
00231   liststringi& i2,
00232   stringc& atom1,
00233   stringc& atom2
00234 )
00235 {
00236   bool res;
00237   
00238   res=atomize_next(atom1);
00239   if (res==false)
00240     return false;
00241   i1=current;
00242 
00243   res=atomize_next(atom2);
00244   if (res==false)
00245     return false;
00246   i2=current;
00247 
00248   // Default to reset iterator to first tag.
00249   current=i1;
00250 
00251   return true;
00252 }
00253 
00254 boolc tokenizer::atomize_next_tag
00255 (
00256   liststringi& i1,
00257   liststringi& i2,
00258   stringc& tag
00259 )
00260 {
00261   liststringi iend_=seq.end();
00262   return atomize_next_tag(i1,i2,tag,iend_);
00263 }
00264 
00265 boolc tokenizer::atomize_next_tag
00266 (
00267   liststringi& i1,
00268   liststringi& i2,
00269   stringc& tag,
00270   liststringi& iend_
00271 )
00272 {
00273 //cout << "atomize_next_tag ";
00274   string tag1="<"+tag+">";
00275 //cout << SHOW(tag1) << " ";
00276   bool res;
00277 
00278   res=atomize_next(tag1,iend_);
00279   if (res==false)
00280     return false;
00281   i1=current;
00282 
00283   string tag2="</"+tag+">";
00284 //cout << SHOW(tag2) << " ";
00285   res=atomize_next(tag2,iend_);
00286   if (res==false)
00287     return false;
00288   i2=current;
00289 
00290   // Default to reset iterator to first tag.
00291   current=i1;
00292 // cout << SHOW(*current) << " 1" << endl;
00293 
00294   return true;
00295 }
00296 
00297 void tokenizer::atomize( stringc & atom )
00298 {
00299   liststringi i = seq.begin();
00300 
00301   for (;i!=seq.end(); ++i)
00302     atomize(i,atom);
00303 }
00304 
00305 void tokenizer::extractfromcurrent
00306 ( 
00307   vector<string> & v, 
00308   stringc & atom 
00309 ) const
00310 {
00311   v.clear();
00312 
00313   string s(*current);
00314 
00315   if (s.empty())
00316     return;
00317 
00318   string::size_type k;
00319 
00320   string::size_type const atomlen = atom.length();
00321 
00322   k = s.find(atom.c_str());
00323   for ( ; k!=string::npos; k=s.find(atom.c_str()) )
00324   {
00325     if (k==0)
00326     {
00327       s.erase(k,atomlen);
00328 
00329       continue;
00330     }
00331 
00332     v.push_back(s.substr(0,k));
00333     s.erase(0,k);
00334   }
00335 
00336   if (s.empty()==false)
00337     v.push_back(s);
00338 }
00339 
00340 void tokenizer::atomize
00341 ( 
00342   liststringi & i, 
00343   stringc & atom,
00344   string::size_type const k0 
00345 )
00346 {
00347   string::size_type const atomlen = atom.length();
00348   if (atomlen==0)
00349     return;
00350 
00351   // Valid iterator?
00352   assert(i!=seq.end());
00353 
00354   // If k0 is not a valid pointer into the string
00355   // k0 assumes find was previously called to find the string.
00356 
00357   string::size_type k;
00358   if (k0!=string::npos) 
00359   {
00360     assert(k0+atomlen-1<i->length());
00361     assert( i->substr(k0,atomlen)==atom );
00362     
00363     k=k0;
00364   }
00365   else
00366     k=i->find(atom.c_str());
00367 
00368   if (k==string::npos)
00369     return;
00370 
00371   liststringi w(i);
00372   ++w;
00373 
00374   if (k==0)
00375   {
00376     // If the string is already atomized exit.
00377     if (i->length()==atomlen)
00378       return;
00379 
00380     string s2 = i->substr(atomlen);
00381     i->erase(atomlen);
00382     seq.insert(w,s2);
00383 
00384     return;
00385   }
00386 
00387   string s2 = i->substr(k);
00388   i->erase(k);
00389   seq.insert(w,s2);
00390 }
00391 
00392 void tokenizer::readaslinesgeneral( stringc & data )
00393 {
00394   readaslines(data);
00395   subtract(",");
00396   subtract(" ");
00397   trim();
00398   remove_if();
00399 }
00400 
00401 void tokenizer::subtract( stringc & atom )
00402 {
00403   atomize(atom);
00404   seq.remove(atom);
00405 }
00406 
00407 ostream & tokenizer::print(ostream & os ) const
00408 {
00409   liststringic i = seq.begin();
00410   liststringic iend2 = seq.end();
00411   if (i!=iend2)
00412     os << *i;
00413   ++i;
00414   for ( ; i!=iend2; ++i )
00415   {
00416     os << printdelimiter << *i;
00417   }
00418 
00419   return os;
00420 }
00421 
00422 void tokenizer::reset()
00423 {
00424   current=seq.begin();
00425 }
00426 
00427 void tokenizer::operator ++ ()
00428 {
00429   if (current==seq.end())
00430     return;
00431 
00432   ++current;
00433 } 
00434 
00435 tokenizer::tokenizer()
00436   : printdelimiter("\n")
00437 {
00438   current=seq.end();
00439 }
00440 
00441 tokenizer::tokenizer(stringc & data)
00442 {
00443   seq.push_back(data);
00444   reset();
00445 }
00446 
00447 ostream & operator << (ostream & os, tokenizer const & ss)
00448 {
00449   return ss.print(os);
00450 }
00451 
00452 boolc tokenizer::operator == (tokenizer & t2)
00453 {
00454   reset();
00455   t2.reset();
00456   for ( ;!t2; ++t2 )
00457   {
00458     if (!(*this)==false)
00459       return false; 
00460 
00461     if ( (*this)() != t2() )
00462       return false;
00463 
00464     ++(*this);
00465   }
00466 
00467   if (!(*this))
00468     return false;
00469 
00470   return true;
00471 }
00472 
00473 void tokenizer::tokenize()
00474 {
00475   subtract(" ");
00476   trim();
00477   remove_if();
00478 }
00479 
00480 tokenizer::operator stringc ()
00481 {
00482   string s;
00483   tokenizer& tk(*this);
00484   for ( tk.reset(); !tk; ++tk)
00485     { s += tk(); }; 
00486   
00487   return s; 
00488 }
00489 
00490 /*
00491 boolc tokenizer::myfind(size_t k, stringc& atom, size_t k0)
00492 {
00493   return false;
00494 }
00495 
00496 boolc tokenizer::myfind(size_t k, stringc& atom)
00497 {
00498   return false;
00499 }
00500 */
00501 
00502 boolc tokenizer::find
00503 (
00504   string::size_type & k, 
00505   stringc & atom, 
00506   string::size_type const k0 
00507 )
00508 {
00509   string::size_type atomsize=atom.size();
00510   if (atomsize==0)
00511   {
00512     current=seq.end();
00513     return false;
00514   }
00515 
00516   liststringi i = current;
00517   assertreturnfalse(i!=seq.end());
00518 
00519   if (k0+atomsize-1<i->size())
00520   {
00521     k = i->find(atom.c_str(),k0);
00522     if (k!=string::npos)
00523       return true;
00524   }
00525     
00526   // failed to find in current string.
00527   ++current;
00528 
00529   return tokenizer::find(k,atom);
00530 }
00531 
00532 boolc tokenizer::find
00533 (
00534   string::size_type & k, 
00535   stringc & atom 
00536 )
00537 {
00538   liststringi i = current;
00539 
00540   for (;i!=seq.end(); ++i)
00541   {
00542     k = i->find(atom.c_str());
00543 
00544     if (k==string::npos)
00545       continue;
00546 
00547     current=i;
00548     return true;
00549   }
00550 
00551   return false;
00552 }
00553 
00554 
00555 
00556 

Generated on Fri Mar 4 00:49:29 2011 for Chelton Evans Source by  doxygen 1.5.8