/* file: xmlparse2.cpp * author: marcus fritzsch, http://fritschy.de * desc: a dumb xml * dumb means no idea of charsets, no idea of xml prolog, * no idea of links, no idea of doctype * date: 2007-05-14 * license: gpl ver. 2 or any later version * see: http://www.w3.org/TR/REC-xml/ * compile: g++ -O2 -Wall -pedantic -ansi xmlparse.cpp -o xmlparse * optional: -DBENCHMARK=100 */ #include #include #include #include #include #include #include #include #include namespace xml { class error : public std::exception { private: const char * m; public: error (const char * msg) throw () : m (msg) {} virtual ~error () throw () {} virtual const char * what () const throw () { return m; } error (const error & e) throw () : m (e.m) {} error & operator = (const error & e) throw () { m = e.m; return *this; } }; class Node { public: typedef std::string name_type; typedef std::string attr_name_type; typedef std::string attr_val_type; typedef std::pair attr_type; typedef std::list attrs_type; typedef std::list childs_type; typedef std::string content_type; Node () throw () : name(), attrs(), content(), childs() {} name_type name; attrs_type attrs; content_type content; childs_type childs; friend std::istream & operator >> (std::istream & is, Node & node); }; class Parser { public: enum Token { NAME, // a letter, in my case a-zA-Z EQ, // '=' VALUE, // an attribute value TAG_OR_DATA, // expect a new tag or #PCDATA TAG_NAME, // is followed by BLANK|/|>|NAME BLANK_END_OR_NAME, // everything that might be after a TAG_NAME TAG_END, DATA, // tags content PROLOGUE, // xml prologue COMMENT, NONE }; Parser () throw () : root(), file() {} void parse (const std::string & f) throw (error); void parse (std::istream & str) throw (error); const Node & getRoot () const throw () { return root; } struct ParseNode { Node & node; Token expect; ParseNode (Node & n, Token e) throw () : node (n), expect(e) {} std::istream & in (std::istream & is) throw (error); }; private: Node root; std::string file; void priv_parse (std::istream & str) throw (error); static char get_char (std::istream & is) throw (error); static char peek_char (std::istream & is) throw (error); static void normalize_white (std::istream & is) throw (error); static void eat_white (std::istream & is) throw (error); static bool is_name (char c) throw (); static bool is_name_char (char c) throw (); static std::string until_end_of (std::istream & is, Token stop) throw (error); }; inline char Parser::get_char (std::istream & is) throw (error) { char c = is.get (); if (c == -1) throw error ("unexpected eof"); return c; } inline char Parser::peek_char (std::istream & is) throw (error) { char c = is.peek (); if (c == -1) throw error ("unexpected eof"); return c; } inline void Parser::normalize_white (std::istream & is) throw (error) { char c; while (std::isspace (c = get_char (is)) && std::isspace (peek_char (is))) ; is.putback (c); } /** * @see http://www.w3.org/TR/2004/REC-xml-20040204/#NT-NameChar */ inline void Parser::eat_white (std::istream & is) throw (error) { char c; while (std::isspace (c = get_char (is))) ; is.putback (c); } /** * @see http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Name */ inline bool Parser::is_name (char c) throw () { return std::isalpha (c) || c == '_' || c == ':'; } inline bool Parser::is_name_char (char c) throw () { return is_name (c) || std::isdigit (c) || c == '-' || c == '.'; } /** * this one does data reading and skipping of tokens i'd like to ignore */ inline std::string Parser::until_end_of (std::istream & is, Token stop) throw (error) { char c; std::string r; bool first = true; bool s = false; while ((c = get_char (is)) && ( s // we are inside a string and behave accordingly || (stop == TAG_NAME && (first ? is_name (c) : is_name_char (c))) || (stop == NAME && (first ? is_name (c) : is_name_char (c))) || (stop == VALUE && c != '"') || (stop == DATA && c != '<') || (stop == PROLOGUE && c != '>') || stop == COMMENT )) { normalize_white (is); // untested... --fritschy, 2006-05-24 if (stop == PROLOGUE && c == '<') { if (get_char (is) != '!') throw error ("unexpected character in PI or DOCTYPE"); until_end_of (is, PROLOGUE); } if (stop == DATA && c == '>') throw error ("expected data, got '>'"); else { if (c == '"') s = !s; if (COMMENT && c == '-') { char c0 = get_char (is); char c1 = peek_char (is); if (c0 == c && c1 == '>') { get_char (is); return r + "-->"; } is.putback (c0); } r += c; first = false; } } if (stop == TAG_NAME) { if (! std::isspace (c) && c != '/' && c != '>') throw error ("unexpected character"); } else if (stop == NAME) { if (! std::isspace (c) && c != '=') throw error ("unexpected character"); } if (stop != PROLOGUE) is.putback (c); return r; } inline std::istream & operator >> (std::istream & is, Parser::ParseNode & n) throw (error) { return n.in (is); } std::istream & Parser::ParseNode::in (std::istream & is) throw (error) { char c; bool tag_ends = false; std::string a_name; std::string a_val; std::string t_name; std::string c_data; while (1) { normalize_white (is); c = get_char (is); if (expect == TAG_NAME && is_name (c)) { is.putback (c); t_name = until_end_of (is, TAG_NAME); if (tag_ends) { if (t_name != node.name) throw error ("start and end tag mismatch"); eat_white (is); c = get_char (is); if (c != '>') throw error ("n.expected end of tag..."); break; } else node.name = t_name; expect = BLANK_END_OR_NAME; } else if (expect == BLANK_END_OR_NAME && c == '/') { expect = TAG_END; tag_ends = true; } else if ( (expect == BLANK_END_OR_NAME && c == '>') || (expect == TAG_END && c == '>')) { if (tag_ends) break; expect = TAG_OR_DATA; } else if (expect == BLANK_END_OR_NAME && is_name (c)) { is.putback (c); a_name = until_end_of (is, NAME); expect = EQ; } else if (expect == EQ && c == '=') { char c0 = get_char (is); if (c0 != '"') throw error ("n.expected '\"' - found something else"); a_val = until_end_of (is, VALUE); c = get_char (is); if (c != '"') throw error ("n.expected '\"' - found something else"); node.attrs.push_back (std::make_pair (a_name, a_val)); expect = BLANK_END_OR_NAME; } else if (expect == TAG_OR_DATA && c == '<') { eat_white (is); char c0 = get_char (is); if (is_name (c0)) { Node nn; ParseNode pn (nn, TAG_NAME); is.putback (c0); is >> pn; node.childs.push_back (pn.node); expect = TAG_OR_DATA; } else if (c0 == '/') { tag_ends = true; expect = TAG_NAME; } } else if (expect == TAG_OR_DATA) { is.putback (c); eat_white (is); node.content = until_end_of (is, DATA); if (c == '<') expect = BLANK_END_OR_NAME; } } return is; } void Parser::priv_parse (std::istream & is) throw (error) { bool tag = false; while (1) { char c0; try { eat_white (is); c0 = get_char (is); } catch (error & e) { break; } if (c0 != '<') throw error ("document not well formed!"); c0 = get_char (is); tag = true; if (c0 == '?') until_end_of (is, PROLOGUE); else if (c0 == '!') { char c1 = get_char (is), c2; if (c1 != '-') { // doctype? until_end_of (is, PROLOGUE); // ah... quick 'n dumb, need to test continue; } c2 = get_char (is); if (c2 == '-') { // comment until_end_of (is, COMMENT); // same as DOCTYPE thing continue; } throw error ("unexpected character ! after <"); } else if (is_name (c0)) { // a normal tag is.putback (c0); ParseNode pn (root, TAG_NAME); is >> pn; } } } inline void Parser::parse (const std::string & f) throw (error) { file = f; std::fstream is (f.c_str ()); priv_parse (is); is.close (); } inline void Parser::parse (std::istream & is) throw (error) { file = "-"; priv_parse (is); } struct StylePolicyXML { void print (const Node & cur, std::ostream & os, const std::string & pfx = "") const throw (); }; void StylePolicyXML::print (const Node & cur, std::ostream & os, const std::string & pfx) const throw () { os << pfx << '<' << cur.name; for (Node::attrs_type::const_iterator i = cur.attrs.begin (); i != cur.attrs.end (); i++) os << ' ' << i->first << "=\"" << i->second << '"'; if (cur.content.length () || cur.childs.size ()) { os << '>' << std::endl; if (cur.content.length ()) os << pfx << " " << cur.content << std::endl; for (Node::childs_type::const_iterator i = cur.childs.begin (); i != cur.childs.end (); i++) print (*i, os, pfx + " "); os << pfx << "'; } else os << " />"; os << std::endl; } struct StylePolicySimple { void print (const Node & cur, std::ostream & os, const std::string & pfx = "") const throw (); }; void StylePolicySimple::print (const Node & cur, std::ostream & os, const std::string & pfx) const throw () { os << pfx << cur.name << std::endl; for (Node::attrs_type::const_iterator i = cur.attrs.begin (); i != cur.attrs.end (); i++) os << pfx << " " << "attr, " << i->first << "='" << i->second << "'" << std::endl; if (cur.content.length () > 0) os << pfx << " " << "content = '" << cur.content << "'" << std::endl; for (Node::childs_type::const_iterator i = cur.childs.begin (); i != cur.childs.end (); i++) print (*i, os, pfx + " "); } template class Printer : public StylePolicy { public: Printer (const Node & n) throw () : node (n) {} const Node & node; }; template std::ostream & operator << (std::ostream & os, const Printer & pr) throw () { pr.print (pr.node, os); return os; } typedef Printer PrinterXML; typedef Printer PrinterSimple; } // namespace xml double seconds () { struct timeval tv; gettimeofday (&tv, NULL); return double(tv.tv_sec) + tv.tv_usec / 1e6; } int main (int argc, char ** argv) { char * xml = "< title style=\"bold\">a " " test< /title> test content < / content>"; std::istringstream ins (xml); std::cerr << "xmlparse, a dumb xml parser (maybe)"; std::cerr << " sizeof (node): " << sizeof (xml::Node) << std::endl << std::endl; double start = seconds (); xml::Parser parser; parser.parse (ins); start = seconds () - start; std::cerr << "test took " << start << " seconds" << std::endl; std::cout << xml::PrinterXML (parser.getRoot ()) << std::endl; int i = 1; while (argv [i]) { #ifdef BENCHMARK double a=std::numeric_limits ::max (), b=0, c=std::numeric_limits ::min (); for (int j = 0; j < BENCHMARK; ++j) { #endif // BENCHMARK parser = xml::Parser (); double start = seconds (); parser.parse (argv [i]); start = seconds () - start; #ifndef BENCHMARK std::cerr << "file: " << argv [i] << ", took " << start << " seconds" << std::endl; std::cout << xml::PrinterXML (parser.getRoot ()) << std::endl; #else a=std::min(start,a); b+=start; c=std::max(start,c); } std::cerr.precision (5); std::cerr << "Timing " << argv[i] << " (min, avg, max) = (" << a << ", " << b/BENCHMARK << ", " << c << ")" << std::endl; #endif ++i; } return 0; } // vim:set foldmethod=marker foldmarker={,} foldminlines=5: