#ifndef TOKENIZER_H 
#define TOKENIZER_H

#include <cassert>
#include <string>
#include <vector>
#include <iostream>
#include <list>
#include <algorithm>
using namespace std;

#include <stringspace.h>

/** Mutable iterator over a list of string tokens. */
typedef std::list<std::string>::iterator       liststringi;
/** Read-only iterator over a list of string tokens. */
typedef std::list<std::string>::const_iterator liststringic;

/*!
\brief Miscellaneous functions which support or use the tokenizer.

These helpers make it possible to treat a whole file as a string,
 so that processing happens in the string domain rather than by
 working with streams.

All members are static; the class only groups them under one name.
*/
class tokenizermisc
{
public:

  /** Read a file into a string.

      NOTE(review): presumably \a fname names the file, \a str receives
      its contents and the result reports success -- the implementation
      is not visible in this header; confirm.
  */
  static boolc readfile
  (
    std::string & str,
    stringc & fname
  );

  /** Compare two strings, ignoring all space.

      NOTE(review): presumably returns true when \a s1 and \a s2 are
      equal once space is disregarded -- confirm against the
      implementation.
  */
  static boolc comparewithoutspace
  (
    stringc & s1,
    stringc & s2
  );

};


/*!
\brief  Primitive parser with state.

Generally concerned with global scope.

This class has two roles. Firstly it is a primitive
 parser. Secondly it has a state so this class is  
 used by another to point to the current token.

I refer to a stream to mean the list of 
 string tokens with a current position in the list.

Use: read in the file and call atomize to separate
 the sequence into individual elements that
 can be interpreted by an interpreter.   

Can directly operate on the data though the list class.

Example 1. Remove comments and delete any empty strings. 
\verbatim
  tokenizer ss;
  ...
  ss.stripcomment("//");
  ss.seq.remove("");
\endverbatim

See vrmlshape where I wrote a VRML parser using this 
 class to hold the current state.
*/
class tokenizer
{
  /** Split a string token by an atom. */
  void atomize( liststringi & i, stringc & atom, string::size_type const k0=string::npos );
public:

  /** The current state. */
  liststringi current;

  /** Data representation. */
  list<string> seq;

  //
  // Iterator Characteristics
  // 

  /** Resets token iteration. */
  void reset();
  /** Can the stream be read?  */
  boolc operator ! () const; 
  /** Modify the current token. */
  string & operator * ();
  /** Access the current token in the stream. */
  stringc & operator() () const; 
  /** Increment the stream's index. */
  void operator ++ ();

  //
  // Processing each element in the list. 
  //

  /** Splits strings. */
  void atomize(stringc & atom);
  /** Splits strings removing the atom. */
  void subtract( stringc & atom );
  /** \n delimiter set. */
  void tokenize();
  /** For each line delete right of and including the 
      comment if it exists in the line. */
  void stripcomment( stringc & comment );
  /** Remove matching tokens. */
  void remove(stringc & token);
  /** Remove invalid lines as interpreted by myspacer. */
  void remove_if();
  /** Remove invalid lines. */
  template< typename SPACER >
  void remove_if(SPACER spacer)
    { seq.remove_if(spacer); }

  /** Apply the functional object x to each element. */
  template< typename X >
  void apply(X x)
  { 
    liststringi i = seq.begin();
    liststringic imax = seq.end();
    for ( ; i!=imax; ++i )
      { x(*i); }
  };
  
  /** Trim the leading and trailing space of each token. */
  void trim();
  /** Remove surrounding white space and delete empty strings. */
  void trim_and_prune();

  //
  // Miscellaneous
  //

  /** The current line is read and parsed, split by the 
      atom and each element is written to the vector v. */
  void extractfromcurrent
  ( 
    vector<string> & v, 
    stringc & atom 
  ) const;

  //
  // Printing
  //

  /** Each element printed is separated by the delimiter. */
  string printdelimiter;
  /** Print seq. Set the printdelimiter to any string. */
  ostream& print(ostream& os) const;

  //
  // Reading
  //

  // Note: constructor by default calls reset().
  //   read* does not have this implemented.

  /** Read the data in. */
  void read(stringc & data);
  /** Read the string as lines.  Each line is a string. */
  void readaslines(stringc & data);
  /** Because special characters in strings can interfere with reading strings, 
      this routine is a hack to remove white spaces, commas and empty lines.*/
  void readaslinesgeneral(stringc & data);

  //
  // Construction
  //

  /** Construct in uninitialized state. */
  tokenizer();

  /** Construct with an initial string. */
  tokenizer(stringc & data);
  /** Data copy */
// TODO?  tokenizer(tokenizer const & tk0);

  /** Compare tokenizers by comparing each token. */
  boolc operator == (tokenizer & t2);
  
  //
  // Parsing and tags
  //

  /** Non destructive text search */
  boolc find(string::size_type & k, stringc & atom );
 
  /** Search from k0 in the current string. */
  boolc find( string::size_type& k, stringc & atom, string::size_type const k0 );

  /** Search for the first atom and atomize it. current points to it.
 * Essentially this is a find function. */
  boolc atomize_next( stringc & atom ); 

  /** */
  boolc atomize_next 
  (
    stringc & atom,
    liststringi & iend_
  );

  /** tag=cat then i1 points to <cat>, i2 to </cat>. */
  boolc atomize_next_tag
  (
    liststringi& i1,
    liststringi& i2,
    stringc& tag,
    liststringi& iend_
  );

  boolc atomize_next_tag
  (
    liststringi& i1,
    liststringi& i2,
    stringc& tag
  );

  /* Search for sequence bounded by two atoms. */
  boolc atomize_next
  (
    liststringi& i1,
    liststringi& i2,
    stringc& atom1,
    stringc& atom2
  );

  /** Output the tokenizer back as a string. */
  operator stringc ();

};

/** Print the tokenizers list of strings. The tokenizer ss 
    can have its printdelimiter configured before printing. */
ostream & operator << (ostream & os, tokenizer const & ss);

#endif



