src/parsers.hxx

   1 /*
   2     Copyright 2013 Karel Matas
   3
   4     This program is free software: you can redistribute it and/or modify
   5     it under the terms of the GNU General Public License as published by
   6     the Free Software Foundation, either version 3 of the License, or
   7     (at your option) any later version.
   8
   9     This program is distributed in the hope that it will be useful,
  10     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12     GNU General Public License for more details.
  13
  14     You should have received a copy of the GNU General Public License
  15     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16 */
  17 #ifndef _PARSERS_HXX
  18 #define _PARSERS_HXX
  19
  20 /*! \file parsers.hxx
  21 * Parsers for JMdict and kanjidic2.
  22 */
  23
  24
  25 #include <vector>
  26 #include <string>
  27 #include <map>
  28 #include <fstream>
  29 #include "3rdparty/rapidxml.hpp"
  30 #include "utils.hxx"
  31 #include "datatypes.hxx"
  32
  33 using aoi::SEPARATOR_SQL;
  34 using aoi::ElementKanji;
  35 using aoi::ElementReading;
  36 using aoi::ElementSense;
  37 using aoi::DicWord;
  38 using aoi::Kanji;
  39 using std::vector;
  40 using std::string;
  41 using std::map;
  42 using rapidxml::xml_node;
  43 using rapidxml::xml_document;
  44
  45 namespace parsers {
  46
  47
  48 /*!
  49 * Base XML parser class. Loads XML file and build XML tree. Must be subclassed.
  50 * \todo Parser should escape aoi::SEPARATOR_SQL character right after loading
  51 * file into memory (i.e. before parsing).
  52 */
  53 class BaseParser
  54 {
  55   private:
  56     char *buffer_;
  57
  58   protected:
  59     xml_document<> doc_;
  60     std::ifstream file_;
  61
  62     /*!
  63     * Get values of all the elements of the type <i>element</i> in <i>node</i>.
  64     * For example XML code:
  65     \verbatim
  66     <person>
  67       <name>John Doe</name>
  68       <phone>1232456789</phone>
  69       <phone>987654321</phone>
  70     </person>
  71     \endverbatim
  72     * get_elements( node, phone ) returns "{ "123456789", "987654321" }"
  73     * \param parent       parent node
  74     * \param element      what element to get
  75     * \param unreference  if true: remove '&' and ';' from the string borders
  76     * \return values of all the elements <i>element</i> or empty vector
  77     */
  78     static vector<string> get_elements ( xml_node<> *parent, const char *element,
  79                                          bool unreference=false );
  80
  81   public:
  82     BaseParser ( const char *filename );
  83     virtual ~BaseParser ();
  84
  85     /*!
  86     * Scans first node of the document for the entities (<!ENTITY).
  87     * \return map in format entity_name:entity_description
  88     */
  89     map<string,string> get_entities ();
  90 };
  91
  92
  93 //! Parser for JMDict_e XML file.
  94 class JmdictParser : public BaseParser
  95 {
  96   private:
  97     int n_entries_ = 0;
  98     int n_reading_ = 0;
  99     int n_kanji_   = 0;
 100     int n_gloss_   = 0;
 101     int n_sense_   = 0;
 102     xml_node<> *entry_ = nullptr;
 103
 104   public:
 105     JmdictParser( const char *filename ) : BaseParser(filename)
 106       { entry_ = doc_.first_node("JMdict")->first_node("entry"); };
 107     ~JmdictParser() {};
 108
 109     /*!
 110     * Gets one entry from JMdict. Caller should call this function until
 111     * Dicword.did() != -1
 112     \verbatim
 113     JmdictParser p("file.xml");
 114     DicWord w = p.get_entry();
 115     while ( w.did() != -1 ){
 116       printf("Word ID: %d\n", w.did());
 117       w = p.get_entry();
 118     }
 119     \endverbatim
 120     * \return DicWord on succes, empty DicWord (did()=-1) otherwise
 121     */
 122     DicWord get_entry ();
 123
 124     //! Returns JMDict version.
 125     string get_version ();
 126
 127 };
 128
 129
 130 //! Parser for kanjidic2 XML file.
 131 class KanjidicParser : public BaseParser
 132 {
 133   private:
 134     int n_entries_ = 0;
 135     xml_node<> *entry_ = nullptr;
 136
 137   public:
 138     KanjidicParser( const char *filename ): BaseParser(filename)
 139       { entry_ = doc_.first_node("kanjidic2")->first_node("character"); };
 140     ~KanjidicParser(){};
 141
 142     /*!
 143     * Gets one entry from kanjidic2. Caller should call this function until
 144     * Kanji.kanji() != ""
 145     \verbatim
 146     KanjidicParser p("file.xml");
 147     Kanji k = p.get_entry();
 148     while ( !k.kanji().empty() ){
 149       printf("Kanji: %s\n", k.kanji().c_str());
 150       k = p.get_entry();
 151     }
 152     \endverbatim
 153     * \return Kanji on success, empty Kanji (kanji()=="") otherwise
 154     */
 155     Kanji get_entry ();
 156
 157     //! Returns kanjidic2 version in format: "version (date)"
 158     string get_version ();
 159 };
 160
 161
 162 } // namespace parsers
 163 #endif // _PARSERS_HXX