Files Classes Functions Hierarchy
00001 #include <tokenizer.h> 00002 00003 #include <fstream> 00004 #include <sstream> 00005 using namespace std; 00006 00007 00008 boolc tokenizer::operator ! () const 00009 { 00010 return (current != seq.end()); 00011 } 00012 00013 string & tokenizer::operator * () 00014 { 00015 assert(current != seq.end()); 00016 return *current; 00017 } 00018 00019 stringc & tokenizer::operator() () const 00020 { 00021 assert(current != seq.end()); 00022 return *current; 00023 } 00024 00025 void tokenizer::remove(stringc & token) 00026 { 00027 seq.remove(token); 00028 } 00029 00030 void tokenizer::remove_if() 00031 { 00032 seq.remove_if(spacerdelete<>()); 00033 } 00034 00035 void tokenizer::trim() 00036 { 00037 apply(spacertrim<>()); 00038 } 00039 00040 void tokenizer::trim_and_prune() 00041 { 00042 trim(); 00043 remove_if(spacerdelete<>()); 00044 } 00045 00046 void tokenizer::read(stringc & data) 00047 { 00048 seq.push_back(data); 00049 } 00050 00051 void tokenizer::readaslines(stringc & data) 00052 { 00053 seq.push_back(data); subtract("\n"); 00054 } 00055 00056 00057 00058 boolc tokenizermisc::comparewithoutspace 00059 ( 00060 stringc & s1, 00061 stringc & s2 00062 ) 00063 { 00064 tokenizer t1(s1); 00065 t1.atomize(" "); 00066 t1.atomize("\n"); 00067 t1.atomize("\t"); 00068 t1.trim(); 00069 t1.remove_if(spacerdelete<>()); 00070 00071 tokenizer t2(s2); 00072 t2.atomize(" "); 00073 t2.atomize("\n"); 00074 t2.atomize("\t"); 00075 t2.trim(); 00076 t2.remove_if(spacerdelete<>()); 00077 00078 return t1==t2; 00079 } 00080 00081 boolc tokenizermisc::readfile 00082 ( 00083 string & str, 00084 stringc & fname 00085 ) 00086 { 00087 assert(false); // Use stringserialization::serialize(str,filename); 00088 ifstream file(fname.c_str()); 00089 if (!file) 00090 return false; 00091 00092 stringstream ss; 00093 ss << file; 00094 00095 str = ss.str(); 00096 00097 00098 // char ch; 00099 00100 /* 00101 if (file.get(ch)) 00102 { 00103 str.push_back(ch); 00104 for ( ; file.get(ch); ) 00105 { 00106 str.push_back(ch) 00107 }; 00108 } 00109 */ 00110 00111 00112 //while (file.get(ch)!= EOF) 00113 /* 00114 while ( file.get(ch) ) 00115 { 00116 str.push_back(ch) 00117 }; 00118 */ 00119 00120 return true; 00121 } 00122 00123 void tokenizer::stripcomment(stringc & comment) 00124 { 00125 liststringi k = seq.begin(); 00126 string::size_type i; 00127 for (;k!=seq.end(); ++k) 00128 { 00129 string & token(*k); 00130 i=0; 00131 i = token.find(comment,i); 00132 if (i==string::npos) 00133 continue; 00134 00135 token.erase(i); 00136 } 00137 } 00138 00139 00140 00141 boolc tokenizer::atomize_next 00142 ( 00143 stringc & atom 00144 ) 00145 { 00146 liststringi i = current; 00147 00148 string::size_type k; 00149 00150 string::size_type const atomlen = atom.length(); 00151 00152 for (;i!=seq.end(); ++i) 00153 { 00154 k = i->find(atom.c_str()); 00155 00156 if (k==string::npos) 00157 continue; 00158 00159 atomize(i,atom,k); 00160 if (i->length()==atomlen) 00161 { 00162 if (atom==*i) 00163 { 00164 current=i; 00165 return true; 00166 } 00167 } 00168 00169 ++i; 00170 atomize(i,atom,0); 00171 assert(i->length()==atomlen); 00172 current=i; 00173 return true; 00174 } 00175 00176 return false; 00177 } 00178 00179 00180 00181 00182 00183 00184 00185 boolc tokenizer::atomize_next 00186 ( 00187 stringc & atom, 00188 liststringi & iend_ 00189 ) 00190 { 00191 liststringi i = current; 00192 00193 string::size_type k; 00194 00195 string::size_type const atomlen = atom.length(); 00196 00197 for (;i!=iend_; ++i) 00198 { 00199 k = i->find(atom.c_str()); 00200 00201 if (k==string::npos) 00202 continue; 00203 00204 atomize(i,atom,k); 00205 if (i->length()==atomlen) 00206 { 00207 if (atom==*i) 00208 { 00209 current=i; 00210 return true; 00211 } 00212 } 00213 00214 ++i; 00215 atomize(i,atom,0); 00216 assert(i->length()==atomlen); 00217 current=i; 00218 return true; 00219 } 00220 00221 return false; 00222 } 00223 00224 00225 00226 00227 00228 boolc tokenizer::atomize_next 00229 ( 00230 liststringi& i1, 00231 liststringi& i2, 00232 stringc& atom1, 00233 stringc& atom2 00234 ) 00235 { 00236 bool res; 00237 00238 res=atomize_next(atom1); 00239 if (res==false) 00240 return false; 00241 i1=current; 00242 00243 res=atomize_next(atom2); 00244 if (res==false) 00245 return false; 00246 i2=current; 00247 00248 // Default to reset iterator to first tag. 00249 current=i1; 00250 00251 return true; 00252 } 00253 00254 boolc tokenizer::atomize_next_tag 00255 ( 00256 liststringi& i1, 00257 liststringi& i2, 00258 stringc& tag 00259 ) 00260 { 00261 liststringi iend_=seq.end(); 00262 return atomize_next_tag(i1,i2,tag,iend_); 00263 } 00264 00265 boolc tokenizer::atomize_next_tag 00266 ( 00267 liststringi& i1, 00268 liststringi& i2, 00269 stringc& tag, 00270 liststringi& iend_ 00271 ) 00272 { 00273 //cout << "atomize_next_tag "; 00274 string tag1="<"+tag+">"; 00275 //cout << SHOW(tag1) << " "; 00276 bool res; 00277 00278 res=atomize_next(tag1,iend_); 00279 if (res==false) 00280 return false; 00281 i1=current; 00282 00283 string tag2="</"+tag+">"; 00284 //cout << SHOW(tag2) << " "; 00285 res=atomize_next(tag2,iend_); 00286 if (res==false) 00287 return false; 00288 i2=current; 00289 00290 // Default to reset iterator to first tag. 00291 current=i1; 00292 // cout << SHOW(*current) << " 1" << endl; 00293 00294 return true; 00295 } 00296 00297 void tokenizer::atomize( stringc & atom ) 00298 { 00299 liststringi i = seq.begin(); 00300 00301 for (;i!=seq.end(); ++i) 00302 atomize(i,atom); 00303 } 00304 00305 void tokenizer::extractfromcurrent 00306 ( 00307 vector<string> & v, 00308 stringc & atom 00309 ) const 00310 { 00311 v.clear(); 00312 00313 string s(*current); 00314 00315 if (s.empty()) 00316 return; 00317 00318 string::size_type k; 00319 00320 string::size_type const atomlen = atom.length(); 00321 00322 k = s.find(atom.c_str()); 00323 for ( ; k!=string::npos; k=s.find(atom.c_str()) ) 00324 { 00325 if (k==0) 00326 { 00327 s.erase(k,atomlen); 00328 00329 continue; 00330 } 00331 00332 v.push_back(s.substr(0,k)); 00333 s.erase(0,k); 00334 } 00335 00336 if (s.empty()==false) 00337 v.push_back(s); 00338 } 00339 00340 void tokenizer::atomize 00341 ( 00342 liststringi & i, 00343 stringc & atom, 00344 string::size_type const k0 00345 ) 00346 { 00347 string::size_type const atomlen = atom.length(); 00348 if (atomlen==0) 00349 return; 00350 00351 // Valid iterator? 00352 assert(i!=seq.end()); 00353 00354 // If k0 is not a valid pointer into the string 00355 // k0 assumes find was previously called to find the string. 00356 00357 string::size_type k; 00358 if (k0!=string::npos) 00359 { 00360 assert(k0+atomlen-1<i->length()); 00361 assert( i->substr(k0,atomlen)==atom ); 00362 00363 k=k0; 00364 } 00365 else 00366 k=i->find(atom.c_str()); 00367 00368 if (k==string::npos) 00369 return; 00370 00371 liststringi w(i); 00372 ++w; 00373 00374 if (k==0) 00375 { 00376 // If the string is already atomized exit. 00377 if (i->length()==atomlen) 00378 return; 00379 00380 string s2 = i->substr(atomlen); 00381 i->erase(atomlen); 00382 seq.insert(w,s2); 00383 00384 return; 00385 } 00386 00387 string s2 = i->substr(k); 00388 i->erase(k); 00389 seq.insert(w,s2); 00390 } 00391 00392 void tokenizer::readaslinesgeneral( stringc & data ) 00393 { 00394 readaslines(data); 00395 subtract(","); 00396 subtract(" "); 00397 trim(); 00398 remove_if(); 00399 } 00400 00401 void tokenizer::subtract( stringc & atom ) 00402 { 00403 atomize(atom); 00404 seq.remove(atom); 00405 } 00406 00407 ostream & tokenizer::print(ostream & os ) const 00408 { 00409 liststringic i = seq.begin(); 00410 liststringic iend2 = seq.end(); 00411 if (i!=iend2) 00412 os << *i; 00413 ++i; 00414 for ( ; i!=iend2; ++i ) 00415 { 00416 os << printdelimiter << *i; 00417 } 00418 00419 return os; 00420 } 00421 00422 void tokenizer::reset() 00423 { 00424 current=seq.begin(); 00425 } 00426 00427 void tokenizer::operator ++ () 00428 { 00429 if (current==seq.end()) 00430 return; 00431 00432 ++current; 00433 } 00434 00435 tokenizer::tokenizer() 00436 : printdelimiter("\n") 00437 { 00438 current=seq.end(); 00439 } 00440 00441 tokenizer::tokenizer(stringc & data) 00442 { 00443 seq.push_back(data); 00444 reset(); 00445 } 00446 00447 ostream & operator << (ostream & os, tokenizer const & ss) 00448 { 00449 return ss.print(os); 00450 } 00451 00452 boolc tokenizer::operator == (tokenizer & t2) 00453 { 00454 reset(); 00455 t2.reset(); 00456 for ( ;!t2; ++t2 ) 00457 { 00458 if (!(*this)==false) 00459 return false; 00460 00461 if ( (*this)() != t2() ) 00462 return false; 00463 00464 ++(*this); 00465 } 00466 00467 if (!(*this)) 00468 return false; 00469 00470 return true; 00471 } 00472 00473 void tokenizer::tokenize() 00474 { 00475 subtract(" "); 00476 trim(); 00477 remove_if(); 00478 } 00479 00480 tokenizer::operator stringc () 00481 { 00482 string s; 00483 tokenizer& tk(*this); 00484 for ( tk.reset(); !tk; ++tk) 00485 { s += tk(); }; 00486 00487 return s; 00488 } 00489 00490 /* 00491 boolc tokenizer::myfind(size_t k, stringc& atom, size_t k0) 00492 { 00493 return false; 00494 } 00495 00496 boolc tokenizer::myfind(size_t k, stringc& atom) 00497 { 00498 return false; 00499 } 00500 */ 00501 00502 boolc tokenizer::find 00503 ( 00504 string::size_type & k, 00505 stringc & atom, 00506 string::size_type const k0 00507 ) 00508 { 00509 string::size_type atomsize=atom.size(); 00510 if (atomsize==0) 00511 { 00512 current=seq.end(); 00513 return false; 00514 } 00515 00516 liststringi i = current; 00517 assertreturnfalse(i!=seq.end()); 00518 00519 if (k0+atomsize-1<i->size()) 00520 { 00521 k = i->find(atom.c_str(),k0); 00522 if (k!=string::npos) 00523 return true; 00524 } 00525 00526 // failed to find in current string. 00527 ++current; 00528 00529 return tokenizer::find(k,atom); 00530 } 00531 00532 boolc tokenizer::find 00533 ( 00534 string::size_type & k, 00535 stringc & atom 00536 ) 00537 { 00538 liststringi i = current; 00539 00540 for (;i!=seq.end(); ++i) 00541 { 00542 k = i->find(atom.c_str()); 00543 00544 if (k==string::npos) 00545 continue; 00546 00547 current=i; 00548 return true; 00549 } 00550 00551 return false; 00552 } 00553 00554 00555 00556
1.5.8