better handling of broken fonts
[swftools.git] / lib / gocr / lines.c
blob396000dd381f14a230144104749293876975914d
1 /*
2 This is an Optical-Character-Recognition program
3 Copyright (C) 2000-2006 Joerg Schulenburg
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 as published by the Free Software Foundation; either version 2
8 of the License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 see README for EMAIL-address
22 #include <stdlib.h>
23 #include <stdio.h>
24 #include <string.h>
25 #include <limits.h>
26 #include <assert.h>
27 #include "pgm2asc.h"
28 #include "gocr.h"
29 #include "unicode.h"
31 const char *getTextLine (int line) {
32 int i;
33 Element *elem;
35 if (line < 0 || line > list_total(&(JOB->res.linelist)))
36 return NULL;
38 for ( i = 0, elem = JOB->res.linelist.start.next; i < line && elem != NULL; i++ )
39 elem = elem->next;
41 if ( elem != NULL )
42 return (const char *)elem->data;
44 return NULL;
47 void free_textlines(void) {
48 for_each_data(&(JOB->res.linelist)) {
49 if (list_get_current(&(JOB->res.linelist)))
50 free(list_get_current(&(JOB->res.linelist)));
51 } end_for_each(&(JOB->res.linelist));
52 list_free(&(JOB->res.linelist));
/* Append the string s1 to the heap string held in buffer.
 *
 * buffer -- NUL-terminated string in memory obtained from malloc/realloc;
 *           its contents are only read when *len > 0, so with *len == 0
 *           any pointer acceptable to realloc() (e.g. NULL) may be passed
 * s1     -- string to append; NULL or "" is reported as a bug on stderr
 *           and leaves buffer untouched
 * len    -- in/out: allocated size of buffer in bytes
 *
 * If the result does not fit, the buffer is enlarged with realloc() in
 * steps of 512 bytes until it does, and *len is updated accordingly.
 * (A single 512-byte step is not enough when s1 itself is longer than
 * that, so the required size is computed first.)
 *
 * Returns the (possibly moved) buffer.  If the buffer cannot be enlarged,
 * "realloc failed!" is printed and the unchanged buffer is returned with
 * *len unchanged.
 */
char *append_to_line(char *buffer, const char *s1, int *len) {
  enum { GROW_STEP = 512 };
  char *temp;
  size_t used = 0, add, need;
  int newlen;

  if( s1==NULL || s1[0] == 0 ){
    fprintf(stderr,"\n#BUG: appending 0 to a line makes no sense!");
    return buffer;
  }
  if ( *len>0 ) used = strlen(buffer);   /* bytes already in use */
  add  = strlen(s1);
  need = used + add + 1;                 /* old text + new text + '\0' */

  /* the size is tracked in an int: refuse requests that cannot be stored */
  if ( add > (size_t)INT_MAX || need > (size_t)(INT_MAX - GROW_STEP) ) {
    fprintf(stderr,"realloc failed!\n");
    return buffer;
  }

  /* grow in fixed steps until the result fits (same threshold as before:
   * enlarge as soon as need >= current size) */
  newlen = *len;
  while ( (int)need >= newlen ) newlen += GROW_STEP;

  if ( newlen != *len ) {
    temp = realloc(buffer, (size_t)newlen);
    if( !temp ) { fprintf(stderr,"realloc failed!\n"); return buffer; }
    buffer = temp;                       /* buffer successfully enlarged */
    *len = newlen;
  }

  memcpy(buffer + used, s1, add + 1);    /* copy including the final '\0' */
  return buffer;
}
78 int calc_median_gap(struct tlines * lines) {
79 int gaps[MAXlines], l;
80 if (lines->num<2) return 0;
81 for (l = 0; l < lines->num - 1; l++)
82 gaps[l] = lines->m2[l + 1] - lines->m3[l];
83 qsort(gaps, lines->num - 1, sizeof(gaps[0]), intcompare);
84 return gaps[(lines->num - 1) / 2];
88 * Return the indent in pixels of the least-indented line.
89 * Will be subtracted as base_indent to avoid negativ indent.
91 * This is adjusted to account for an angle on the page as
92 * a whole. For instance, if the page is rotated clockwise,
93 * lower lines may be physically closer to the left edge
94 * than higher lines that are logically less indented.
95 * We rotate around (0,0). Note that this rotation could
96 * rotate lines "off the left margin", leading to a negative
97 * indent.
99 * boxlist -- list of character boxes.
100 * dx, dy -- rotation angle as vector
102 int get_least_line_indent(List * boxlist, int dx, int dy) {
103 int min_indent = INT_MAX;
104 int adjusted_indent;
105 struct box * box2;
106 if (JOB->cfg.verbose)
107 fprintf(stderr, "get_least_line_indent: rot.vector dxdy %d %d\n",
108 dx, dy);
109 for_each_data(boxlist) {
110 box2 = (struct box *)list_get_current(boxlist);
111 /* if num == -1, indicates this is a space or newline box,
112 * inserted in list_insert_spaces. */
113 if (box2->num != -1) {
114 adjusted_indent = box2->x0;
115 if (dx) adjusted_indent += box2->y0 * dy / dx;
116 if (adjusted_indent < min_indent) {
117 min_indent = adjusted_indent;
118 if (dy!=0 && JOB->cfg.verbose)
119 fprintf(stderr,
120 "# Line %2d, unadjusted xy %3d %3d, adjusted x %2d\n",
121 box2->line, box2->x0, box2->y0, adjusted_indent);
124 } end_for_each(boxlist);
125 if (JOB->cfg.verbose)
126 fprintf(stderr, "# Minimum adjusted x: %d (min_indent)\n", min_indent);
127 return min_indent;
130 /* collect all the chars from the box tree and write them to a string buffer
131 mo is the mode: mode&8 means, use chars even if unsure recognized
132 ToDo: store full text(?), store decoded text+boxes+position chars (v0.4)
133 (HTML,UTF,ASCII,XML), not wchar incl. dexcriptions (at<95% in red)
134 remove decode(*c, job->cfg.out_format) from gocr.c!
135 XML add alternate-tags, format tags and position tags
136 ToDo: better output XML to stdout instead of circumstantial store to lines
137 not all texts/images follow the line concept?
138 Better use a tree of objects where leafes are chars instead of simple list.
139 Chars or objects are taken into account. Objects can be text strings
140 or XML strings.
142 void store_boxtree_lines(int mo) {
143 char *buffer; /* temp buffer for text */
144 int i = 0, j = 0;
145 int len = 1024; // initial buffer length for text line
146 struct box *box2;
147 int median_gap = 0;
148 int max_single_space_gap = 0;
149 struct tlines line_info;
150 int line, line_gap, oldline=-1;
151 int left_margin;
152 int i1=0, i2=0;
154 buffer = (char *)malloc(len);
155 if ( !buffer ) {
156 fprintf(stderr,"malloc failed!\n"); // ToDo: index_to_error_list
157 return;
159 *buffer = 0;
161 if ( JOB->cfg.verbose&1 )
162 fprintf(stderr,"# store boxtree to lines ...");
164 /* wew: calculate the median line gap, to determine line spacing
165 * for the text output. The line gap used is between one line's
166 * m3 (baseline) and the next line's m2 (height of non-rising
167 * lowercase). We use these lines as they are the least likely
168 * to vary according to actual character content of lines.
170 median_gap = calc_median_gap(&JOB->res.lines);
171 if (median_gap <= 0) {
172 fprintf(stderr, "# Warning: non-positive median line gap of %d\n",
173 median_gap);
174 median_gap = 8;
175 max_single_space_gap = 12; /* arbitrary */
176 } else {
177 max_single_space_gap = median_gap * 7 / 4;
180 // Will be subtracted as base_indent to avoid negativ indent.
181 left_margin = get_least_line_indent(&JOB->res.boxlist,
182 JOB->res.lines.dx,
183 JOB->res.lines.dy);
185 if (JOB->cfg.out_format==XML) { /* subject of change */
186 char s1[255]; /* ToDo: avoid potential buffer overflow !!! */
187 /* output lot of usefull information for XML filter */
188 sprintf(s1,"<page x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\">\n",
189 0,0,0,0);
190 buffer=append_to_line(buffer,s1,&len);
191 sprintf(s1,"<block x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\">\n",
192 0,0,0,0);
193 buffer=append_to_line(buffer,s1,&len);
196 for_each_data(&(JOB->res.boxlist)) {
197 box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
198 line = box2->line;
199 line_info = JOB->res.lines;
200 /* reset the output char if certainty is below the limit v0.44 */
201 if (box2->num_ac && box2->wac[0]<JOB->cfg.certainty) box2->c=UNKNOWN;
202 if (line!=oldline) {
203 if (JOB->cfg.out_format==XML && oldline>-1) { /* subject of change */
204 buffer=append_to_line(buffer,"</line>\n",&len);
205 list_app( &(JOB->res.linelist), (void *)strdup(buffer) ); // wcsdup
206 memset(buffer, 0, len);
207 j=0; // reset counter for new line
209 if (JOB->cfg.out_format==XML) { /* subject of change */
210 char s1[255]; /* ToDo: avoid potential buffer overflow !!! */
211 /* output lot of usefull information for XML filter */
212 sprintf(s1,"<line x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" value=\"%d\">\n",
213 line_info.x0[line],line_info.m1[line],
214 line_info.x1[line]-line_info.x0[line]+1,
215 line_info.m4[line]-line_info.m1[line],line);
216 buffer=append_to_line(buffer,s1,&len);
218 oldline=line;
220 if (box2->c > ' ' &&
221 box2->c <= 'z') i1++; /* count non-space chars */
222 if (box2->c == '\n') {
223 if (JOB->cfg.out_format!=XML) { /* subject of change */
224 line_info = JOB->res.lines;
225 line = box2->line;
226 if (line > 0) {
227 line_gap = line_info.m2[line] - line_info.m3[line - 1];
228 for (line_gap -= max_single_space_gap; line_gap > 0;
229 line_gap -= median_gap) {
230 buffer=append_to_line(buffer,"\n",&len);
231 j++; /* count chars in line */
234 list_app( &(JOB->res.linelist), (void *)strdup(buffer) ); // wcsdup
235 memset(buffer, 0, len);
236 j=0; // reset counter for new line
239 if (box2->c == ' ') // fill large gaps with spaces
241 if (JOB->res.avX) { /* avoid SIGFPE */
242 if (JOB->cfg.out_format==XML) { /* subject of change */
243 char s1[255]; /* ToDo: avoid potential buffer overflow !!! */
244 /* output lot of usefull information for XML filter */
245 sprintf(s1," <space x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" />\n",
246 box2->x0,box2->y0,box2->x1-box2->x0+1,box2->y1-box2->y0+1);
247 buffer=append_to_line(buffer,s1,&len);
248 } else
249 for (i = (box2->x1 - box2->x0) / (2 * JOB->res.avX) + 1; i > 0; i--) {
250 buffer=append_to_line(buffer," ",&len);
251 j++;
255 else if (box2->c != '\n') {
256 if (j==0 && JOB->res.avX) /* first char in new line? */ {
257 int indent = box2->x0 - JOB->res.lines.x0[box2->line];
258 /* correct for angle of page as a whole. */
259 if (JOB->res.lines.dx)
260 indent += box2->y0 * JOB->res.lines.dy / JOB->res.lines.dx;
261 /* subtract the base margin. */
262 indent -= left_margin;
263 if (JOB->cfg.out_format==XML) { /* subject of change */
264 char s1[255]; /* ToDo: avoid potential buffer overflow !!! */
265 /* output lot of usefull information for XML filter */
266 sprintf(s1," <space x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" />\n",
267 box2->x0,box2->y0,box2->x1-box2->x0+1,box2->y1-box2->y0+1);
268 buffer=append_to_line(buffer,s1,&len);
269 } else
270 for (i = indent / JOB->res.avX; i > 0; i--) {
271 buffer=append_to_line(buffer," ",&len); j++;
274 if (JOB->cfg.out_format==XML) { /* subject of change */
275 char s1[255]; /* ToDo: avoid potential buffer overflow !!! */
276 /* output lot of usefull information for XML filter */
277 sprintf(s1," <box x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" value=\"",
278 box2->x0,box2->y0,box2->x1-box2->x0+1,box2->y1-box2->y0+1);
279 buffer=append_to_line(buffer,s1,&len);
280 if (box2->num_ac>1) { /* output alist */
283 if (box2->c != UNKNOWN && box2->c!=0) {
284 buffer=
285 append_to_line(buffer,decode(box2->c,JOB->cfg.out_format),&len);
286 if (box2->c > ' ' &&
287 box2->c <= 'z') i2++; /* count non-space chars */
288 } else {
289 wchar_t cc; cc=box2->c;
290 if (box2->num_ac>0 && box2->tas[0]
291 && (JOB->cfg.out_format!=XML || box2->tas[0][0]!='<')) {
292 buffer=append_to_line(buffer,box2->tas[0],&len);
293 j+=strlen(box2->tas[0]);
294 } else {
295 buffer=
296 append_to_line(buffer,decode(cc,JOB->cfg.out_format),&len);
299 if (JOB->cfg.out_format==XML) {
300 if (box2->num_ac>0) {
301 /* output alist ToDo: separate <altbox ...> */
302 int i1; char s1[256];
303 sprintf(s1,"\" numac=\"%d\" weights=\"",box2->num_ac);
304 buffer=append_to_line(buffer,s1,&len);
305 for (i1=0;i1<box2->num_ac;i1++) {
306 sprintf(s1,"%d",box2->wac[i1]);
307 buffer=append_to_line(buffer,s1,&len);
308 if (i1+1<box2->num_ac) buffer=append_to_line(buffer,",",&len);
310 if (box2->num_ac>1)
311 buffer=append_to_line(buffer,"\" achars=\"",&len);
312 for (i1=1;i1<box2->num_ac;i1++) {
313 if (box2->tas[i1] && box2->tas[i1][0]!='<')
314 buffer=append_to_line(buffer,box2->tas[i1],&len);
315 else
316 buffer=append_to_line(buffer,
317 decode(box2->tac[i1],JOB->cfg.out_format),&len);
318 // ToDo: add tas[] (achars->avalues or alternate_strings?
319 if (i1+1<box2->num_ac) buffer=append_to_line(buffer,",",&len);
322 buffer=append_to_line(buffer,"\" />\n",&len);
324 if (box2->num_ac && box2->tas[0]) {
325 if (box2->tas[0][0]=='<') { /* output special XML object */
326 buffer=append_to_line(buffer,box2->tas[0],&len);
327 buffer=append_to_line(buffer,"\n",&len);
328 j+=strlen(box2->tas[0]);
331 j++;
333 i++;
334 } end_for_each(&(JOB->res.boxlist));
335 if (JOB->cfg.out_format==XML && oldline>-1) { /* subject of change */
336 buffer=append_to_line(buffer,"</line>\n",&len);
338 if (JOB->cfg.out_format==XML) { /* subject of change */
339 buffer=append_to_line(buffer,"</block>\n</page>\n",&len);
342 /* do not forget last line */
343 // is there no \n in the last line? If there is, delete next line.
344 list_app( &(JOB->res.linelist), (void *)strdup(buffer) );
345 free(buffer);
346 if( JOB->cfg.verbose&1 )
347 fprintf(stderr,"... %d lines, boxes= %d, chars= %d\n",i,i1,i2);