Main Page   Class Hierarchy   File List  

ocXML.h

00001 #include <string>
00002 #include <iomanip>
00003 #include <algorithm>
00004 #include <map>
00005 #include <vector>
00006 #include <stack>
00007 #include <fstream>
00008 #include "ocString.h"
00009 
00010 #ifndef OC_XML_H
00011 #define OC_XML_H
00012 
00013 using namespace std;
00014 
00015 // define a base function object to be used by the parser
00016 class parseFobject
00017 {
00018 public:
00019   // ctor
00020   parseFobject()
00021   {;}
00022   // the function operator itself
00023   virtual void operator () ( ocString & in )
00024   {;}// default base method does nothing
00025 };
00026 
00027 // define the base parser
00028 class ocGenericParser
00029 {
00030 protected:
00031   istream * iStream;
00032   ocString  input;
00033   parseFobject * pCurrentFunction;
00034 public:
00035   ocGenericParser( istream * in ):iStream(in),pCurrentFunction(NULL)
00036   {
00037     ;
00038   }
00039   ocGenericParser( string & in ):iStream(NULL),pCurrentFunction(NULL)
00040   {
00041     input = in;
00042   }
00043   virtual ~ocGenericParser()
00044   {;}
00045   bool parse( void )
00046   {
00047     // the object will be entered with 1 of 2 constructors
00048     bool bGood=true;
00049     do
00050     {
00051       // IS a function, input to parse, input is not finished parsing
00052       while( pCurrentFunction && input.length() && !input.endOfParse() )
00053       {
00054         curFunc()(input);
00055       }
00056       if(iStream)
00057       {
00058         iStream->clear();
00059         getline(*iStream,input);
00060         input.parseInit();
00061         bGood = iStream->good();
00062       }
00063     } while( pCurrentFunction && iStream && !iStream->eof() );
00064     return bGood;
00065   }
00066   parseFobject & curFunc(void)
00067   {
00068     return *pCurrentFunction;
00069   }
00070   virtual void callback( void )
00071   {
00072     ;
00073   }
00074 };
00075 
00076 // Define a derived function object to overload
00077 class baseFunc: public parseFobject
00078 {
00079 protected:
00080   ocGenericParser & rParser; // reference to parser
00081 public:
00082   // ctor
00083   baseFunc(ocGenericParser & irParser):parseFobject(),rParser(irParser){;}
00084   // v dtor
00085   virtual ~baseFunc(){;}
00086 };
00087 
/*
  Abstraction of a (flat) xml node:
    name     - tag name
    data     - text content
    attr     - attribute name -> value
    monopole - true for a self-closing tag, ex: <tag/>
*/
typedef std::map<std::string,std::string> node_attr;
class xmlNode
{
public:
  std::string name;
  std::string data;
  node_attr attr;
  bool monopole;
  // Starts as an empty, non-monopole node.
  // Copy construction and assignment are the compiler-generated memberwise
  // ones; every member is a value type, so no hand-written versions are needed.
  xmlNode()
  :name(),data(),attr(),monopole(false)
  {}
  // Render this node as XML text.
  // Attributes are written in key order, each as name='value'.
  // A monopole node is closed with "/>" and its data is not written;
  // otherwise the result is <name ...>data</name>.
  // NOTE(review): values and data are appended verbatim, with no escaping,
  // so a value containing a single quote produces malformed output.
  std::string emit( void ) const
  {
    std::string ret = "<";
    ret += name;
    for( node_attr::const_iterator it = attr.begin(); it != attr.end(); ++it )
    {
      ret += " ";
      ret += it->first;
      ret += "='";
      ret += it->second;
      ret += "'";
    }
    if( monopole )
    {
      ret += "/>";
    }
    else
    {
      ret += ">";
      ret += data;
      ret += "</";
      ret += name;
      ret += ">";
    }
    return ret;
  }
};
00144 
00145 /*
00146   This particular implementation flattens the tree into a vector of nodes, with a multimap indexing them by name.
00147 */
00148 typedef class vector<xmlNode> node_vector;
00149 typedef class multimap<string,size_t> node_map;
00150 typedef class stack<size_t> node_stack;
00151 struct stateMachine
00152 {
00153   node_vector nodes;
00154   node_stack nodestack;
00155   node_map   nodemap;
00156   xmlNode & topNode( void )
00157   {
00158     return nodes[nodestack.top()];
00159   }
00160 };
00161 
00162 // start with the XML operators
00163 class findStart: public baseFunc
00164 {
00165 public:
00166   ocString data;
00167   bool foundStart;
00168   findStart(ocGenericParser & irParser):baseFunc(irParser)
00169   {
00170     foundStart = false;
00171   }
00172   void operator () ( ocString & in )
00173   {
00174     data += in.parse("<");
00175     foundStart = (in.lastPos() > 0);
00176     if( foundStart )
00177     {
00178       rParser.callback();
00179       data = "";
00180     }
00181   }
00182 };
00183 
00184 
00185 class findEnd: public baseFunc
00186 {
00187 public:
00188   ocString data;
00189   bool foundEnd;
00190 
00191   findEnd(ocGenericParser & irParser):baseFunc(irParser)
00192   {
00193     foundEnd = false;
00194   }
00195 
00196   void operator () ( ocString & in )
00197   {
00198     data += in.parse(">");
00199     foundEnd = (in.lastPos() > 0);
00200     if( foundEnd )
00201     {
00202       rParser.callback();
00203       data = "";
00204     }
00205   }
00206 };
00207 /*
00208   Escape these values:
00209   int [34]  hex [22] = ["] = &#x22;
00210   int [37]  hex [25] = [%] = &#x25;
00211   int [38]  hex [26] = [&] = &#x26;
00212   int [39]  hex [27] = ['] = &#x27;
00213   int [43]  hex [2b] = [+] = &#x2b;
00214   int [60]  hex [3c] = [<] = &#x3c;
00215   int [62]  hex [3e] = [>] = &#x3e;
00216 
00217   in hex the escape would be &#xNN;
00218   The values will be unescaped by the xml parser.
00219   see W3C XML spec section 4.1 Character and Entity References
00220 */
// Escape the characters " % & ' + < > as hexadecimal character references
// of the form &#xNN; and return the result. All other characters are copied
// unchanged.
//
// The input is walked once, so the '&' of a reference that was just written
// is never escaped a second time. A chain of replace-all passes that escapes
// '"' or '%' before '&' would turn '"' into "&#x26;#x22;".
//
// inline: this function is defined in a header and must not produce one
// definition per translation unit.
inline std::string xmlEscape( std::string in )
{
  std::string out;
  out.reserve( in.size() );
  for( std::string::size_type i = 0; i < in.size(); ++i )
  {
    switch( in[i] )
    {
      case '"':  out += "&#x22;"; break;
      case '%':  out += "&#x25;"; break;
      case '&':  out += "&#x26;"; break;
      case '\'': out += "&#x27;"; break;
      case '+':  out += "&#x2b;"; break;
      case '<':  out += "&#x3c;"; break;
      case '>':  out += "&#x3e;"; break;
      default:   out += in[i];    break;
    }
  }
  return out;
}
// Reverse of the escaping above: turn the references &#x22; &#x25; &#x26;
// &#x27; &#x2b; &#x3c; &#x3e; back into " % & ' + < > and return the result.
// Only these seven lowercase spellings are recognised; anything else,
// including a truncated reference, is copied unchanged.
//
// The input is walked once, so a '&' produced by decoding &#x26; is never
// combined with the text that follows it and decoded again. For example
// "&#x26;#x27;" decodes to the literal text "&#x27;", not to a quote.
//
// inline: this function is defined in a header and must not produce one
// definition per translation unit.
inline std::string xmlUnescape( std::string in )
{
  static const char * const refs[] =
    { "&#x22;", "&#x25;", "&#x26;", "&#x27;", "&#x2b;", "&#x3c;", "&#x3e;" };
  static const char plain[] =
    { '"', '%', '&', '\'', '+', '<', '>' };
  const std::string::size_type refLen = 6;  // every reference is 6 chars long
  const int refCount = 7;

  std::string out;
  out.reserve( in.size() );
  std::string::size_type i = 0;
  while( i < in.size() )
  {
    bool matched = false;
    if( in[i] == '&' )
    {
      for( int k = 0; k < refCount; ++k )
      {
        // compare() clamps the length at the end of the string, so a
        // reference cut off by the end of input simply does not match
        if( in.compare( i, refLen, refs[k] ) == 0 )
        {
          out += plain[k];
          i += refLen;
          matched = true;
          break;
        }
      }
    }
    if( !matched )
    {
      out += in[i];
      ++i;
    }
  }
  return out;
}
00243 
00244 // Now make the special xml flavor of the parser
00245 class xmlParser : public ocGenericParser
00246 {
00247 private:
00248   // function objects
00249   findStart    start;
00250   findEnd      end;  
00251   node_vector::iterator xnode_it;
00252   
00253   void startCallback(void)
00254   {
00255     size_t len = start.data.length();
00256     if( len && !states.nodestack.empty() )
00257     {
00258       // Add data to the last known top node
00259       xmlNode & rNode =  states.topNode();
00260       rNode.data += xmlUnescape(start.data);
00261     }
00262     pCurrentFunction = &end;
00263   }
00264   void endCallback(void)
00265   {
00266     ocString parseableData = end.data;
00267     size_t len = end.data.length();
00268     if( len == 0 ) return;
00269 
00270     // see if this is the end ex: </tag>
00271     bool isEndTag = end.data[0] == '/';
00272     // see if this is a monopole ex: <tag/>
00273     bool isMonoTag = end.data[len-1] == '/';
00274     if( isMonoTag )
00275     {
00276       len--;
00277       parseableData.resize(len);
00278     }
00279     if( !isEndTag || isMonoTag )
00280     {
00281       // This is the beginning container tag, or a monotag
00282       // EX: [name attr1='x' attr2="y z" ...]
00283       //   so parse the name and attributes:
00284       xmlNode node;
00285       // Get the name
00286       node.name = parseableData.tokenParse( " \t\n\r" );
00287       if( isMonoTag ) node.monopole = true;
00288       // Get the attributes (if any)
00289       do
00290       {
00291         // EX: [attr1='x' attr2="y z" ...]
00292         string attrCandidate = parseableData.parse( "=" );
00293         // for attributes after the first attribute, there will be a space...
00294         while( attrCandidate.size() && (
00295                attrCandidate[0] == ' ' ||
00296                attrCandidate[0] == '\n' || 
00297                attrCandidate[0] == '\t' ) ) attrCandidate.erase(0,1);
00298         if(attrCandidate.length())
00299         {
00300           string attrName = attrCandidate;
00301           // EX: ['x' attr2="y z" ...]
00302           // build token on valid chars ' or "
00303           char pTok[2];
00304           pTok[0] = parseableData.remainder()[0];
00305           pTok[1] = '\0';
00306           parseableData.parse( pTok ); // strip the first quote
00307           string attrValue = parseableData.parse( pTok ); // get the value
00308           // EX: [ attr2="y z" ...] NOTE THE SPACE
00309           if( attrName.length() && attrValue.length() )
00310           {
00311             node.attr.insert(make_pair(attrName,xmlUnescape(attrValue)));
00312           }
00313         }
00314       } while(!parseableData.endOfParse());
00315       // add the node
00316       addNode(node);
00317             
00318       if( !isMonoTag )
00319       {
00320         // Push the last item on the stack
00321         //   so data can be added to the parent
00322         //   node in a nested doc.
00323         size_t nTop = states.nodes.size()-1;
00324         states.nodestack.push(nTop);
00325       }
00326     } // End if start of container
00327     if(isEndTag)
00328     {
00329       // This contained item is finished so
00330       //   pop the last item off the stack
00331       states.nodestack.pop();
00332     }
00333     pCurrentFunction = &start;
00334   }
00335 public:
00336   // state machine
00337   stateMachine states;
00338 
00339   xmlParser( istream * in ):ocGenericParser(in),start(*this),end(*this)
00340   {
00341     pCurrentFunction = &start;
00342   }
00343   xmlParser(string & in):ocGenericParser(in),start(*this),end(*this)
00344   {
00345     pCurrentFunction = &start;
00346   }
00347   virtual ~xmlParser(){;}
00348 
00349   virtual void callback( void )
00350   {
00351     if( pCurrentFunction == &start )
00352     {
00353       startCallback();
00354     }
00355     else if(pCurrentFunction == &end)
00356     {
00357       endCallback();
00358     }
00359   }
00360   // Return the node list
00361   node_vector & nodeList( void )
00362   {
00363     return states.nodes;
00364   }
00365   // Add a new node, outside of the parse
00366   void addNode( xmlNode &node )
00367   {
00368     states.nodes.push_back(node);
00369     states.nodemap.insert(make_pair(node.name,states.nodes.size()-1));
00370   }
00371   
00372   // Re-emit as XML
00373   string emit( void )
00374   {
00375     string ret;
00376     for(int i=0;i<states.nodes.size();i++)
00377     {
00378       ret += states.nodes[i].emit();
00379     }
00380     return ret;
00381   }
00382   
00383   
00384   // Find the first instance of a node that has an attribute 'name' having 'value'
00385   node_vector::iterator & findFirstNodeByAttribute( string name, string value )
00386   {
00387     xnode_it = states.nodes.begin();    
00388     for( ;xnode_it!=states.nodes.end();++xnode_it)
00389     {
00390       node_attr::iterator it = (*xnode_it).attr.find(name);
00391       if( it != (*xnode_it).attr.end() )
00392       {
00393         if( it->second == value ) 
00394         {             
00395           break;
00396         }  
00397       }    
00398     }
00399     return xnode_it;
00400   } 
00401 };
00402 
00403 /*
00404 Usage example:
00405 int main( int argc, char * argv[] )
00406 {
00407 
00408   xmlParser parser( &cin );
00409   parser.parse();
00410   node_vector & xnodes = parser.nodeList();
00411   int i;
00412 
00413   for(i=0;i<xnodes.size();i++)
00414   {
00415     xmlNode & node = xnodes[i];
00416     cout << "Node: " << node.name << endl
00417          << " contains: " << node.data << endl;
00418     cout << "node attributes" << endl << "=================" << endl;
00419     node_attr::iterator x;
00420     for( x=node.attr.begin(); x!=node.attr.end(); ++x)
00421     {
00422       cout << x->first << " = " << x->second << endl;
00423     }
00424     cout << "=================" << endl;
00425   }
00426   return 0;
00427 }
00428 
00429 */
00430 #endif
00431 

Generated on Tue Jan 20 09:03:27 2004 for OpenTools by doxygen1.2.18