2 Copyright 2013 Karel Matas
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 #include "parsers.hxx"
25 using utils::to_string
;
27 //////////////////////////////////////////////////////////////////////////
// Collects the text values of the children of `parent` that are named
// `element`, starting at parent->first_node(element) and advancing with
// next_sibling(element). Returns them as a vector<string>.
//
// For a value of the form "&name;" the leading '&' and the trailing ';' are
// stripped: `start` is 1 when the value begins with '&', and `end` is the
// length of the remaining text (it is passed as the length argument of the
// string(str, pos, len) constructor, not as an end index).
//
// NOTE(review): the loop header, the test on `unreference` and the return
// statement are not visible in this excerpt; the stripping presumably only
// happens when `unreference` is true -- confirm.
// NOTE(review): s.front() / s.back() are called without a visible emptiness
// check; confirm element values can never be empty here.
30 vector
<string
> BaseParser::get_elements ( xml_node
<> *parent
, const char *element
, bool unreference
)
33 auto *elt
= parent
->first_node(element
);
35 string s
= elt
->value();
37 size_t start
= (s
.front()=='&'); // erase & from the beginning of elt
38 size_t end
= s
.size() - start
- (s
.back()==';'); // erase ; from the end of elt
39 s
= string(s
,start
,end
);
43 elt
= elt
->next_sibling(element
);
48 //////////////////////////////////////////////////////////////////////////
// Opens `filename`, loads it into buffer_ and parses it into doc_.
//
// Steps visible here:
//   * open file_; throw utils::CantOpenFile(filename) if it is not open
//   * seek to the end, take tellg() as the size, seek back to the beginning
//   * allocate buffer_ as new char[size+1] and read `size` bytes into it
//   * parse buffer_ with rapidxml, keeping the DOCTYPE node
//     (rapidxml::parse_doctype_node)
//
// Throws utils::ParsingError with "Source file has zero length. (...)" and
// with "Only N bytes was read."; the conditions guarding these two throws
// are not visible in this excerpt (presumably size == 0 and read != size).
//
// NOTE(review): tellg() reports failure as -1 and its result is stored in an
// int; confirm the failure case is handled before `new char[size+1]`.
// NOTE(review): rapidxml parses a zero-terminated buffer; the extra byte is
// allocated but the store of the terminator is not visible here -- confirm.
// NOTE(review): buffer_ is a raw new[] allocation; confirm it is released if
// one of the later throws fires.
50 BaseParser::BaseParser ( const char *filename
)
52 file_
.open( filename
);
54 if ( !file_
.is_open() )
55 throw utils::CantOpenFile(filename
);
58 file_
.seekg( 0, file_
.end
);
59 int size
= file_
.tellg();
60 file_
.seekg( 0, file_
.beg
);
64 ss
<< "Source file has zero length. (" << filename
<< ")";
65 throw utils::ParsingError( ss
.str() );
69 buffer_
= new char[size
+1];
70 file_
.read( buffer_
, size
);
71 int read
= file_
.gcount();
74 throw utils::ParsingError("Only " + std::to_string(read
) + " bytes was read.");
76 // DOCTYPE node contains ENTITY definitions
77 doc_
.parse
<rapidxml::parse_doctype_node
>( buffer_
);
// Destructor.
// NOTE(review): the body is not visible in this excerpt. The constructor
// allocates buffer_ with new char[...], so confirm it is released here with
// delete[].
80 BaseParser::~BaseParser ()
// Builds a map<string,string> from the ENTITY declarations found in the
// DOCTYPE node(s) of doc_.
//
// Top-level nodes are walked from doc_.first_node() via next_sibling(). For
// a node of type rapidxml::node_doctype the node value is searched with
// strstr for "<!ENTITY "; for each hit:
//   * skip the 9-character "<!ENTITY " prefix
//   * copy characters up to the first space into buff
//   * skip the run of spaces
//   * copy characters up to '>' into buff (the entity description)
//   * store string(buff) under the key `abbr`
//   * search for the next "<!ENTITY" from the current position
//
// NOTE(review): the declarations of buff, i, abbr and m, the loop headers
// and the return are not visible in this excerpt; presumably abbr is built
// from the first copy and the map m is returned -- confirm.
// NOTE(review): neither copy loop shows a bound on buff or a check for the
// terminating NUL of the doctype text; confirm buff is large enough and
// that malformed declarations cannot run past the end of the buffer.
// NOTE(review): the first search pattern has a trailing space, the follow-up
// search does not, yet both are followed by `pos += 9`.
87 map
<string
,string
> BaseParser::get_entities ()
90 auto *node
= doc_
.first_node();
92 if ( node
->type() == rapidxml::node_doctype
){
93 char *pos
= strstr( node
->value(), "<!ENTITY " );
97 pos
+= 9; // strlen("<!ENTITY ")
99 while ( *pos
!= ' ' ) buff
[i
++] = *pos
++;
103 while ( *pos
== ' ' ) pos
++;
104 // entity description
106 while ( *pos
!= '>' ) buff
[i
++] = *pos
++;
108 m
[abbr
] = string(buff
);
109 pos
= strstr( pos
, "<!ENTITY" );
112 node
= node
->next_sibling();
117 //////////////////////////////////////////////////////////////////////////
// Returns a version string built as `s + " " + ver`.
//
// Rewinds file_ to the beginning and loops while the stream is good and the
// counter n is below 1000, so the scan is capped at 1000 iterations.
// NOTE(review): the loop body and the origin of `s` and `ver` are not
// visible in this excerpt; presumably the leading lines of the file are
// scanned for a version marker -- confirm.
120 string
JmdictParser::get_version ()
124 file_
.seekg( 0, file_
.beg
);
125 while ( file_
.good() && n
< 1000 ){
130 return s
+ " " + ver
;
// Converts the current <entry> node (entry_) into a DicWord and advances
// entry_ to the next sibling <entry>.
//
// The result is brace-initialised from {did, vk_ele, vr_ele, vs_ele}:
//   * did    - ent_seq value converted with std::stoi
//   * vk_ele - one ElementKanji per <k_ele>: running id n_kanji_, keb text,
//              ke_inf list (entity-stripped), and whether ke_pri is present
//   * vr_ele - one ElementReading per <r_ele>: running id n_reading_, reb
//              text, re_nokanji presence, re_restr list, re_inf list
//              (entity-stripped), and whether re_pri is present
//   * vs_ele - one ElementSense per <sense>: running id n_sense_ followed by
//              gloss, stagk, stagr, pos, xref, ant, field, misc, dial, s_inf
//
// Side effects: n_kanji_, n_reading_ and n_sense_ are incremented once per
// element pushed, and n_gloss_ grows by the number of glosses of each sense.
//
// NOTE(review): first_node("ent_seq"), keb and reb are dereferenced without
// a visible null check, and std::stoi throws on non-numeric text; confirm
// the input is validated against the DTD.
// NOTE(review): entry_ becomes null after the last entry; presumably callers
// check for that before calling again -- confirm.
137 DicWord
JmdictParser::get_entry ()
145 vector
<ElementKanji
> vk_ele
;
146 vector
<ElementReading
> vr_ele
;
147 vector
<ElementSense
> vs_ele
;
150 did
= std::stoi( entry_
->first_node("ent_seq")->value() );
152 // KANJI (element k_ele)
153 // Elements of interest (name, count, description):
155 // ke_inf 0+ information about keb
156 // ke_pri 0+ if not empty: keb can be considered "common" or "frequent"
157 auto k_ele
= entry_
->first_node("k_ele");
158 while ( k_ele
!= 0 ){
159 auto keb
= k_ele
->first_node("keb");
160 auto ke_inf
= get_elements( k_ele
, "ke_inf", true );
161 bool freq
= ( k_ele
->first_node("ke_pri") != 0 );
162 vk_ele
.push_back( { n_kanji_
++, keb
->value(), ke_inf
, freq
} );
163 k_ele
= k_ele
->next_sibling("k_ele");
166 // READING (element r_ele)
167 // Elements of interest (name, count, description):
169 // re_nokanji 0-1 if 1: reb cannot be regarded as a true reading of the kanji
170 // re_restr 0+ reb only applies for this keb
171 // re_inf 0+ information about reb
172 // re_pri 0+ if not empty: reb can be considered "common" or "frequent"
173 auto r_ele
= entry_
->first_node("r_ele");
174 while ( r_ele
!= 0 ){
175 auto reb
= r_ele
->first_node("reb");
176 auto re_restr
= get_elements( r_ele
, "re_restr" );
177 auto re_inf
= get_elements( r_ele
, "re_inf", true );
178 bool re_nokanji
= ( r_ele
->first_node("re_nokanji") != 0 );
179 bool freq
= ( r_ele
->first_node("re_pri") != 0 );
180 vr_ele
.push_back( {n_reading_
++, reb
->value(), re_nokanji
, re_restr
, re_inf
, freq
} );
181 r_ele
= r_ele
->next_sibling("r_ele");
184 // SENSE (element sense)
185 // Elements of interest (name, count, description):
196 auto *sense
= entry_
->first_node("sense");
197 while ( sense
!= 0 ){
198 auto gloss
= get_elements( sense
, "gloss" );
199 auto stagk
= get_elements( sense
, "stagk" );
200 auto stagr
= get_elements( sense
, "stagr" );
201 auto pos
= get_elements( sense
, "pos", true );
202 auto xref
= get_elements( sense
, "xref" );
203 auto ant
= get_elements( sense
, "ant" );
204 auto field
= get_elements( sense
, "field", true );
205 auto misc
= get_elements( sense
, "misc", true );
206 auto dial
= get_elements( sense
, "dial", true );
207 auto s_inf
= get_elements( sense
, "s_inf" );
208 vs_ele
.push_back( {n_sense_
++,gloss
,stagk
,stagr
,pos
,xref
,ant
,field
,misc
,dial
,s_inf
} );
209 n_gloss_
+= gloss
.size();
210 sense
= sense
->next_sibling("sense");
212 entry_
= entry_
->next_sibling("entry");
215 return {did
,vk_ele
,vr_ele
,vs_ele
};
218 //////////////////////////////////////////////////////////////////////////
// Builds the version string "<database_version> (<date_of_creation>)" from
// the <header> element under the <kanjidic2> root of doc_.
//
// NOTE(review): the return statement is not visible in this excerpt;
// presumably ss.str() is returned -- confirm.
// NOTE(review): the kanjidic2, header, database_version and date_of_creation
// nodes are dereferenced without a visible null check.
221 string
KanjidicParser::get_version ()
223 auto header
= doc_
.first_node("kanjidic2")->first_node("header");
224 const char *version
= header
->first_node("database_version")->value();
225 const char *date
= header
->first_node("date_of_creation")->value();
226 std::stringstream ss
;
227 ss
<< version
<< " (" << date
<< ")";
// Fills a Kanji `k` from the current <character> node (entry_) and advances
// entry_ to the next sibling <character>.
//
// Fields read, in order:
//   * literal                 -> k.kanji
//   * codepoint/cp_value      -> cp_type "ucs" goes to k.ucs; a cp_type that
//                                starts with "jis" is tested for, but the
//                                body of that branch is not visible here
//   * radical/rad_value       -> rad_type "classical" goes to k.rad_classic,
//                                "nelson_c" goes to k.rad_nelson
//   * query_code/q_code       -> for qc_type "skip" the value is split on
//                                "-" into ints; anything other than exactly
//                                3 parts throws utils::ParsingError with a
//                                message naming the value and k.ucs().
//                                Otherwise k.skip receives the parts plus
//                                the skip_misclass attribute value, or 0
//                                when that attribute is absent
//   * misc                    -> stroke_count goes to k.strokes; freq, jlpt
//                                and grade are applied only when present
//                                (jlpt and grade converted with atoi)
//   * reading_meaning (opt.)  -> inside rmgroup: readings with r_type
//                                "ja_on" go to k.onyomi, "ja_kun" to
//                                k.kunyomi; meanings without an m_lang
//                                attribute (English) go to k.meaning;
//                                nanori values are collected with
//                                get_elements
//
// NOTE(review): the loop headers, the declaration of k and msg, and the
// return are not visible in this excerpt.
// NOTE(review): the sibling walks use next_sibling() without a name, so
// nodes of other kinds may be visited; confirm the attributes (cp_type,
// rad_type, qc_type, r_type) are null-checked before ->value() is used.
// NOTE(review): codepoint, radical, query_code, misc and stroke_count are
// dereferenced without a visible null check.
231 Kanji
KanjidicParser::get_entry ()
239 k
.kanji ( entry_
->first_node("literal")->value() );
241 auto *cp_value
= entry_
->first_node("codepoint")->first_node("cp_value");
243 auto *type
= cp_value
->first_attribute("cp_type")->value();
244 if ( !strcmp( type
, "ucs" ) )
245 k
.ucs ( cp_value
->value() );
246 else if ( !strncmp( type
, "jis", 3 ) )
248 cp_value
= cp_value
->next_sibling();
251 auto *rad_value
= entry_
->first_node("radical")->first_node("rad_value");
253 auto *type
= rad_value
->first_attribute("rad_type")->value();
254 if ( !strcmp( type
, "classical" ) )
255 k
.rad_classic( rad_value
->value() );
256 else if ( !strcmp( type
, "nelson_c" ) )
257 k
.rad_nelson( rad_value
->value() );
258 rad_value
= rad_value
->next_sibling();
262 auto *query_code
= entry_
->first_node("query_code");
264 auto *q_code
= query_code
->first_node("q_code");
266 auto *type
= q_code
->first_attribute("qc_type")->value();
267 auto *misclass
= q_code
->first_attribute("skip_misclass");
268 if ( !strcmp( type
, "skip" ) ){
269 vector
<int> v
= utils::split_string_int(q_code
->value(),"-");
270 if ( v
.size() != 3 ){
272 snprintf( msg
, 512, "Kanjiparser: Wrong SKIP %s. (U+%s)",
273 q_code
->value(), k
.ucs().c_str());
274 throw utils::ParsingError(msg
);
276 k
.skip( v
, misclass
? misclass
->value():0 );
278 q_code
= q_code
->next_sibling();
282 auto *misc
= entry_
->first_node("misc");
283 auto *freq
= misc
->first_node("freq");
284 auto *jlpt
= misc
->first_node("jlpt");
285 auto *grade
= misc
->first_node("grade");
286 k
.strokes( misc
->first_node("stroke_count")->value() );
287 if ( freq
) k
.freq( freq
->value() );
288 if ( jlpt
) k
.jlpt( atoi(jlpt
->value()) );
289 if ( grade
) k
.grade( atoi(grade
->value()) );
291 auto *reading_meaning
= entry_
->first_node("reading_meaning");
292 if ( reading_meaning
){
293 auto *rmgroup
= reading_meaning
->first_node("rmgroup");
295 auto *reading
= rmgroup
->first_node("reading");
297 auto *type
= reading
->first_attribute("r_type");
299 if ( !strcmp( type
->value(), "ja_on") )
300 k
.onyomi( reading
->value() );
301 else if ( !strcmp( type
->value(), "ja_kun") )
302 k
.kunyomi( reading
->value() );
304 reading
= reading
->next_sibling();
306 auto *meaning
= rmgroup
->first_node("meaning");
308 auto *fattr
= meaning
->first_attribute("m_lang");
309 if ( !fattr
) // english meaning has no m_lang attribute
310 k
.meaning( meaning
->value() );
311 meaning
= meaning
->next_sibling();
314 k
.nanori( get_elements( reading_meaning
, "nanori") );
318 entry_
= entry_
->next_sibling("character");
325 } // namespace parsers