2 This is an Optical-Character-Recognition program
3 Copyright (C) 2000-2006 Joerg Schulenburg
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 as published by the Free Software Foundation; either version 2
8 of the License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 see README for EMAIL-address
31 const char *getTextLine (int line
) {
35 if (line
< 0 || line
> list_total(&(JOB
->res
.linelist
)))
38 for ( i
= 0, elem
= JOB
->res
.linelist
.start
.next
; i
< line
&& elem
!= NULL
; i
++ )
42 return (const char *)elem
->data
;
47 void free_textlines(void) {
48 for_each_data(&(JOB
->res
.linelist
)) {
49 if (list_get_current(&(JOB
->res
.linelist
)))
50 free(list_get_current(&(JOB
->res
.linelist
)));
51 } end_for_each(&(JOB
->res
.linelist
));
52 list_free(&(JOB
->res
.linelist
));
/* Append the string s1 to the text held in buffer.
 *
 *   buffer -- heap buffer holding a '\0'-terminated string (may be NULL,
 *             in which case a new buffer is allocated)
 *   s1     -- string to append; NULL or "" is reported as a bug on stderr
 *             and leaves the buffer untouched
 *   len    -- in/out: allocated size of buffer in bytes; updated whenever
 *             the buffer is enlarged
 *
 * If the buffer is too small (or *len <= 0) it is enlarged with realloc()
 * in steps of 512 bytes until the combined text fits.  The buffer may
 * move, so the caller must continue with the returned pointer.
 * If realloc() fails, a message is printed, nothing is appended and the
 * old buffer is returned with *len unchanged.
 */
char *append_to_line(char *buffer, const char *s1, int *len) {
  char *temp;
  size_t used = 0;  /* length of the text already in buffer */
  size_t add;       /* length of the text to append */
  size_t cap;       /* usable capacity of buffer */
  size_t newcap;

  if (s1 == NULL || s1[0] == 0) {
    fprintf(stderr, "\n#BUG: appending 0 to a line makes no sense!");
    return buffer;
  }

  /* a NULL buffer has no capacity, whatever *len claims */
  cap = (buffer != NULL && *len > 0) ? (size_t)*len : 0;
  if (cap > 0) used = strlen(buffer);
  add = strlen(s1);

  if (used + add + 1 >= cap) {
    /* grow by as many 512-byte steps as needed; a single step is not
     * enough when s1 is longer than the remaining space plus 512 */
    newcap = cap + ((used + add + 1 - cap) / 512 + 1) * 512;
    if (newcap > (size_t)INT_MAX) {  /* *len could not represent the size */
      fprintf(stderr, "realloc failed!\n");
      return buffer;
    }
    temp = realloc(buffer, newcap);
    if (!temp) {
      fprintf(stderr, "realloc failed!\n");
      return buffer;
    }
    buffer = temp;  /* buffer successfully enlarged */
    *len = (int)newcap;
  }

  /* copy to the end of the buffered string, including the final '\0' */
  memcpy(buffer + used, s1, add + 1);
  return buffer;
}
78 int calc_median_gap(struct tlines
* lines
) {
79 int gaps
[MAXlines
], l
;
80 if (lines
->num
<2) return 0;
81 for (l
= 0; l
< lines
->num
- 1; l
++)
82 gaps
[l
] = lines
->m2
[l
+ 1] - lines
->m3
[l
];
83 qsort(gaps
, lines
->num
- 1, sizeof(gaps
[0]), intcompare
);
84 return gaps
[(lines
->num
- 1) / 2];
88 * Return the indent in pixels of the least-indented line.
89 * Will be subtracted as base_indent to avoid negativ indent.
91 * This is adjusted to account for an angle on the page as
92 * a whole. For instance, if the page is rotated clockwise,
93 * lower lines may be physically closer to the left edge
94 * than higher lines that are logically less indented.
95 * We rotate around (0,0). Note that this rotation could
96 * rotate lines "off the left margin", leading to a negative
99 * boxlist -- list of character boxes.
100 * dx, dy -- rotation angle as vector
102 int get_least_line_indent(List
* boxlist
, int dx
, int dy
) {
103 int min_indent
= INT_MAX
;
106 if (JOB
->cfg
.verbose
)
107 fprintf(stderr
, "get_least_line_indent: rot.vector dxdy %d %d\n",
109 for_each_data(boxlist
) {
110 box2
= (struct box
*)list_get_current(boxlist
);
111 /* if num == -1, indicates this is a space or newline box,
112 * inserted in list_insert_spaces. */
113 if (box2
->num
!= -1) {
114 adjusted_indent
= box2
->x0
;
115 if (dx
) adjusted_indent
+= box2
->y0
* dy
/ dx
;
116 if (adjusted_indent
< min_indent
) {
117 min_indent
= adjusted_indent
;
118 if (dy
!=0 && JOB
->cfg
.verbose
)
120 "# Line %2d, unadjusted xy %3d %3d, adjusted x %2d\n",
121 box2
->line
, box2
->x0
, box2
->y0
, adjusted_indent
);
124 } end_for_each(boxlist
);
125 if (JOB
->cfg
.verbose
)
126 fprintf(stderr
, "# Minimum adjusted x: %d (min_indent)\n", min_indent
);
130 /* collect all the chars from the box tree and write them to a string buffer
131 mo is the mode: mode&8 means, use chars even if recognition is unsure
132 ToDo: store full text(?), store decoded text+boxes+position chars (v0.4)
133 (HTML,UTF,ASCII,XML), not wchar incl. descriptions (at<95% in red)
134 remove decode(*c, job->cfg.out_format) from gocr.c!
135 XML add alternate-tags, format tags and position tags
136 ToDo: better output XML to stdout instead of circumstantial store to lines
137 not all texts/images follow the line concept?
138 Better use a tree of objects where leaves are chars instead of a simple list.
139 Chars or objects are taken into account. Objects can be text strings
142 void store_boxtree_lines(int mo
) {
143 char *buffer
; /* temp buffer for text */
145 int len
= 1024; // initial buffer length for text line
148 int max_single_space_gap
= 0;
149 struct tlines line_info
;
150 int line
, line_gap
, oldline
=-1;
154 buffer
= (char *)malloc(len
);
156 fprintf(stderr
,"malloc failed!\n"); // ToDo: index_to_error_list
161 if ( JOB
->cfg
.verbose
&1 )
162 fprintf(stderr
,"# store boxtree to lines ...");
164 /* wew: calculate the median line gap, to determine line spacing
165 * for the text output. The line gap used is between one line's
166 * m3 (baseline) and the next line's m2 (height of non-rising
167 * lowercase). We use these lines as they are the least likely
168 * to vary according to actual character content of lines.
170 median_gap
= calc_median_gap(&JOB
->res
.lines
);
171 if (median_gap
<= 0) {
172 fprintf(stderr
, "# Warning: non-positive median line gap of %d\n",
175 max_single_space_gap
= 12; /* arbitrary */
177 max_single_space_gap
= median_gap
* 7 / 4;
180 // Will be subtracted as base_indent to avoid negativ indent.
181 left_margin
= get_least_line_indent(&JOB
->res
.boxlist
,
185 if (JOB
->cfg
.out_format
==XML
) { /* subject of change */
186 char s1
[255]; /* ToDo: avoid potential buffer overflow !!! */
187 /* output lot of usefull information for XML filter */
188 sprintf(s1
,"<page x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\">\n",
190 buffer
=append_to_line(buffer
,s1
,&len
);
191 sprintf(s1
,"<block x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\">\n",
193 buffer
=append_to_line(buffer
,s1
,&len
);
196 for_each_data(&(JOB
->res
.boxlist
)) {
197 box2
= (struct box
*)list_get_current(&(JOB
->res
.boxlist
));
199 line_info
= JOB
->res
.lines
;
200 /* reset the output char if certainty is below the limit v0.44 */
201 if (box2
->num_ac
&& box2
->wac
[0]<JOB
->cfg
.certainty
) box2
->c
=UNKNOWN
;
203 if (JOB
->cfg
.out_format
==XML
&& oldline
>-1) { /* subject of change */
204 buffer
=append_to_line(buffer
,"</line>\n",&len
);
205 list_app( &(JOB
->res
.linelist
), (void *)strdup(buffer
) ); // wcsdup
206 memset(buffer
, 0, len
);
207 j
=0; // reset counter for new line
209 if (JOB
->cfg
.out_format
==XML
) { /* subject of change */
210 char s1
[255]; /* ToDo: avoid potential buffer overflow !!! */
211 /* output lot of usefull information for XML filter */
212 sprintf(s1
,"<line x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" value=\"%d\">\n",
213 line_info
.x0
[line
],line_info
.m1
[line
],
214 line_info
.x1
[line
]-line_info
.x0
[line
]+1,
215 line_info
.m4
[line
]-line_info
.m1
[line
],line
);
216 buffer
=append_to_line(buffer
,s1
,&len
);
221 box2
->c
<= 'z') i1
++; /* count non-space chars */
222 if (box2
->c
== '\n') {
223 if (JOB
->cfg
.out_format
!=XML
) { /* subject of change */
224 line_info
= JOB
->res
.lines
;
227 line_gap
= line_info
.m2
[line
] - line_info
.m3
[line
- 1];
228 for (line_gap
-= max_single_space_gap
; line_gap
> 0;
229 line_gap
-= median_gap
) {
230 buffer
=append_to_line(buffer
,"\n",&len
);
231 j
++; /* count chars in line */
234 list_app( &(JOB
->res
.linelist
), (void *)strdup(buffer
) ); // wcsdup
235 memset(buffer
, 0, len
);
236 j
=0; // reset counter for new line
239 if (box2
->c
== ' ') // fill large gaps with spaces
241 if (JOB
->res
.avX
) { /* avoid SIGFPE */
242 if (JOB
->cfg
.out_format
==XML
) { /* subject of change */
243 char s1
[255]; /* ToDo: avoid potential buffer overflow !!! */
244 /* output lot of usefull information for XML filter */
245 sprintf(s1
," <space x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" />\n",
246 box2
->x0
,box2
->y0
,box2
->x1
-box2
->x0
+1,box2
->y1
-box2
->y0
+1);
247 buffer
=append_to_line(buffer
,s1
,&len
);
249 for (i
= (box2
->x1
- box2
->x0
) / (2 * JOB
->res
.avX
) + 1; i
> 0; i
--) {
250 buffer
=append_to_line(buffer
," ",&len
);
255 else if (box2
->c
!= '\n') {
256 if (j
==0 && JOB
->res
.avX
) /* first char in new line? */ {
257 int indent
= box2
->x0
- JOB
->res
.lines
.x0
[box2
->line
];
258 /* correct for angle of page as a whole. */
259 if (JOB
->res
.lines
.dx
)
260 indent
+= box2
->y0
* JOB
->res
.lines
.dy
/ JOB
->res
.lines
.dx
;
261 /* subtract the base margin. */
262 indent
-= left_margin
;
263 if (JOB
->cfg
.out_format
==XML
) { /* subject of change */
264 char s1
[255]; /* ToDo: avoid potential buffer overflow !!! */
265 /* output lot of usefull information for XML filter */
266 sprintf(s1
," <space x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" />\n",
267 box2
->x0
,box2
->y0
,box2
->x1
-box2
->x0
+1,box2
->y1
-box2
->y0
+1);
268 buffer
=append_to_line(buffer
,s1
,&len
);
270 for (i
= indent
/ JOB
->res
.avX
; i
> 0; i
--) {
271 buffer
=append_to_line(buffer
," ",&len
); j
++;
274 if (JOB
->cfg
.out_format
==XML
) { /* subject of change */
275 char s1
[255]; /* ToDo: avoid potential buffer overflow !!! */
276 /* output lot of usefull information for XML filter */
277 sprintf(s1
," <box x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" value=\"",
278 box2
->x0
,box2
->y0
,box2
->x1
-box2
->x0
+1,box2
->y1
-box2
->y0
+1);
279 buffer
=append_to_line(buffer
,s1
,&len
);
280 if (box2
->num_ac
>1) { /* output alist */
283 if (box2
->c
!= UNKNOWN
&& box2
->c
!=0) {
285 append_to_line(buffer
,decode(box2
->c
,JOB
->cfg
.out_format
),&len
);
287 box2
->c
<= 'z') i2
++; /* count non-space chars */
289 wchar_t cc
; cc
=box2
->c
;
290 if (box2
->num_ac
>0 && box2
->tas
[0]
291 && (JOB
->cfg
.out_format
!=XML
|| box2
->tas
[0][0]!='<')) {
292 buffer
=append_to_line(buffer
,box2
->tas
[0],&len
);
293 j
+=strlen(box2
->tas
[0]);
296 append_to_line(buffer
,decode(cc
,JOB
->cfg
.out_format
),&len
);
299 if (JOB
->cfg
.out_format
==XML
) {
300 if (box2
->num_ac
>0) {
301 /* output alist ToDo: separate <altbox ...> */
302 int i1
; char s1
[256];
303 sprintf(s1
,"\" numac=\"%d\" weights=\"",box2
->num_ac
);
304 buffer
=append_to_line(buffer
,s1
,&len
);
305 for (i1
=0;i1
<box2
->num_ac
;i1
++) {
306 sprintf(s1
,"%d",box2
->wac
[i1
]);
307 buffer
=append_to_line(buffer
,s1
,&len
);
308 if (i1
+1<box2
->num_ac
) buffer
=append_to_line(buffer
,",",&len
);
311 buffer
=append_to_line(buffer
,"\" achars=\"",&len
);
312 for (i1
=1;i1
<box2
->num_ac
;i1
++) {
313 if (box2
->tas
[i1
] && box2
->tas
[i1
][0]!='<')
314 buffer
=append_to_line(buffer
,box2
->tas
[i1
],&len
);
316 buffer
=append_to_line(buffer
,
317 decode(box2
->tac
[i1
],JOB
->cfg
.out_format
),&len
);
318 // ToDo: add tas[] (achars->avalues or alternate_strings?
319 if (i1
+1<box2
->num_ac
) buffer
=append_to_line(buffer
,",",&len
);
322 buffer
=append_to_line(buffer
,"\" />\n",&len
);
324 if (box2
->num_ac
&& box2
->tas
[0]) {
325 if (box2
->tas
[0][0]=='<') { /* output special XML object */
326 buffer
=append_to_line(buffer
,box2
->tas
[0],&len
);
327 buffer
=append_to_line(buffer
,"\n",&len
);
328 j
+=strlen(box2
->tas
[0]);
334 } end_for_each(&(JOB
->res
.boxlist
));
335 if (JOB
->cfg
.out_format
==XML
&& oldline
>-1) { /* subject of change */
336 buffer
=append_to_line(buffer
,"</line>\n",&len
);
338 if (JOB
->cfg
.out_format
==XML
) { /* subject of change */
339 buffer
=append_to_line(buffer
,"</block>\n</page>\n",&len
);
342 /* do not forget last line */
343 // is there no \n in the last line? If there is, delete next line.
344 list_app( &(JOB
->res
.linelist
), (void *)strdup(buffer
) );
346 if( JOB
->cfg
.verbose
&1 )
347 fprintf(stderr
,"... %d lines, boxes= %d, chars= %d\n",i
,i1
,i2
);