3 Convert a .blib file into something that is easily compressible
4 and is not as redundant as the .blib format.
6 Copyright (C) 2008, 2009 Holger Hans Peter Freyther <zecke@openmoko.org>
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
36 print "WARNING: LZO not imported performance will be degraded"
39 def prepare_run(text_runs
):
40 """Count the occurrences of fonts, glyphs and position"""
43 text_runs
.sort(textrun
.TextRun
.cmp)
46 def map_glyph(glyphmap
, font_id
, glyph_id
):
48 gen_font_file.py has replaced some glyphs to create a more
49 dense data structure. We have to apply the glyph remapping
50 for the given font to identify the right font. If the glyph
51 is not remapped we will use the original glyph-id.
54 glyphmap
[font_id
][glyph_id
]
58 def write_to_file(text_runs
, fonts
, glyphmap
, auto_kern_bit
):
60 A function saving the text runs and hoping autokern will do its job
64 def write_pending_bit(output
, font_id
, run
):
66 The text run is sorted by paragraph and all glyphs of
67 one paragraph are on the same line and have roughly the
72 output
.append(",%d," % (run
.first_x
))
74 output
.append(",%d_%d," % (run
.first_x
, run
.first_y
))
77 for glyph
in run
.glyphs
:
78 list.append(map_glyph(glyphmap
, font_id
, glyph
['glyph']))
79 output
.append("-".join(list))
83 for text_run
in text_runs
:
84 # we might have a new font now
87 output
.append("f%s" % fonts
[font
])
90 write_pending_bit(output
, fonts
[last_font
], text_run
)
92 text
= "".join(output
)
94 compressed
= lzo
.compress(text
)
98 auto_kern_bit
.write(struct
.pack("<I", len(compressed
)))
99 auto_kern_bit
.write(compressed
)
103 parser
= optparse
.OptionParser(version
= "Generate Huffman code utility 0.1",
104 usage
= """%prog [options] input_file
105 Two modes are supported. Single conversion or batch conversion""")
107 parser
.add_option("-f", "--fontmap", help = "specify the fontmap.map to use",
108 action
= "store", dest
= "fontmap", default
= "fontmap.map")
109 parser
.add_option("-g", "--glyphmap", help = "specify the glyphmap.map to use",
110 action
= "store", dest
= "glyphmap", default
= "glyphmap.map")
111 parser
.add_option("-o", "--output", help = "Output file",
112 action
= "store", dest
= "output_file", default
= "huffmaned.cde")
113 parser
.add_option("-b", "--batch", help = "start a batch job",
114 action
= "store_true", dest
= "batch", default
= False)
115 parser
.add_option("-a", "--batch-output", help = "Output file for the batch",
116 action
= "store", dest
= "output_batch_file", default
= "wikipedia.set")
117 parser
.add_option("-c", "--batch-offset", help = "File with offsets of articles",
118 action
= "store", dest
= "output_marker", default
= "wikipedia.offset")
119 parser
.add_option("-e", "--error-file", help = "File where to put errors",
120 action
= "store", dest
= "error_file", default
= "failed_blib.files")
121 parser
.add_option("-j", "--job", help = "specify the job number",
122 action
= "store", dest
= "jobnumber", default
= "1")
123 parser
.add_option("-d", "--dict", help = "No duplicates... generate a pickled dict...",
124 action
= "store", dest
="dictionary", default
= "dictionary.dict")
126 opts
, args
= parser
.parse_args(sys
.argv
)
127 opts
.jobnumber
= int(opts
.jobnumber
)
131 options
, args
= parse()
133 # Import Psyco if available
141 if not options
.batch
:
142 glyphs
= textrun
.load(open(args
[1]))
143 text_runs
= textrun
.generate_text_runs(glyphs
, 240)
144 prepare_run(text_runs
)
145 fontmap
= fontmap
.load(options
.fontmap
)
146 glyphmap
= glyphmap
.load(options
.glyphmap
)
147 auto_kern_bit
= open(options
.output_file
, "w")
148 write_to_file(text_runs
, fontmap
, glyphmap
, auto_kern_bit
)
150 # We got pointed to a list of directories and will collect
151 # the 'work' files from there, pick up the objects and then
152 # do some work on them.
153 offset_marker
= open(options
.output_marker
, "w")
154 batch_output
= open(options
.output_batch_file
, "w")
155 fontmap
= fontmap
.load(options
.fontmap
)
156 glyphmap
= glyphmap
.load(options
.glyphmap
)
157 failed
= open(options
.error_file
, "a")
159 dict = pickle
.load(open(options
.dictionary
))
163 def convert(base_name
, hash):
165 Convert a single file
172 file_name
= os
.path
.join(base_name
, "articles", hash[0], hash[1:3], hash)
173 file_name
= "%s.blib" % file_name
174 glyphs
= textrun
.load(open(file_name
, 'rb'))
175 text_runs
= textrun
.generate_text_runs(glyphs
, 240)
176 prepare_run(text_runs
)
178 # write the offset to another file...
179 print >> offset_marker
, \
180 "INSERT INTO Offsets (offset, file, hash) VALUES (%d, %d, '%s');" % (batch_output
.tell(), options
.jobnumber
, hash)
181 write_to_file(text_runs
, fontmap
, glyphmap
, batch_output
)
183 print >> offset_marker
, "BEGIN TRANSACTION;"
184 for arg
in range(1, len(args
)):
185 for work
in glob
.glob(os
.path
.join(args
[arg
], "*.work")):
186 print "Working on %s" % work
189 data
= line
[:-1].split(" ", 1)
191 convert(args
[arg
], data
[0])
193 print >> failed
, "Error: %s from %s" % (data
[0], work
)
194 print >> offset_marker
, "COMMIT;"
197 pickle
.dump(dict, open(options
.dictionary
, "w"))
199 print "Done. Have fun!"