3 /* Messy DOS-specific code for correctly treating binary, Unix text, and DOS text files.
6 This has several aspects:
8 * Guessing the file type (unless the user tells us);
9 * Stripping CR characters from DOS text files (otherwise regex
10 functions won't work correctly);
11 * Reporting correct byte count with -b for any kind of file.
16 UNKNOWN
, DOS_BINARY
, DOS_TEXT
, UNIX_TEXT
20 off_t pos
; /* position in buffer passed to matcher */
21 off_t add
; /* how much to add when reporting char position */
/* Flag tested before the position map is built in undossify_input and
   again at the top of dossified_pos.  NOTE(review): presumably non-zero
   means offsets are reported as for the CR-stripped text -- confirm
   against the code that sets it.  */
24 static int dos_report_unix_offset
= 0;
/* Type of the file currently being processed.  Reset from
   dos_use_file_type when a new file starts and, while still UNKNOWN,
   filled in by guess_type.  */
26 static File_type dos_file_type
= UNKNOWN
;
/* Type copied into dos_file_type for each new file; UNKNOWN leaves the
   decision to guess_type.  NOTE(review): presumably set from a
   user option -- confirm.  */
27 static File_type dos_use_file_type
= UNKNOWN
;
/* Count of stripped CR characters; stored as the `add' value of new
   position-map entries.  */
28 static off_t dos_stripped_crs
= 0;
/* Table mapping positions in the matcher's buffer to the amount that
   must be added when reporting them; grown with xrealloc.  */
29 static struct dos_map
*dos_pos_map
;
/* Number of entries dos_pos_map has room for: 1000 on the first
   growth, otherwise twice inp_map_idx.  */
30 static int dos_pos_map_size
= 0;
/* Index of the last map entry filled in (set from inp_map_idx); upper
   bound for the forward search in dossified_pos.  */
31 static int dos_pos_map_used
= 0;
/* inp_map_idx: index of the map entry currently being written by
   undossify_input.
   out_map_idx: index where the previous lookup in dossified_pos ended,
   used as the starting guess for the next lookup.  */
32 static int inp_map_idx
= 0, out_map_idx
= 1;
34 /* Guess DOS file type by looking at its contents. */
35 static inline File_type
36 guess_type (char *buf
, register size_t buflen
)
39 register char *bp
= buf
;
43 /* Treat a file as binary if it has a NUL character. */
47 /* CR before LF means DOS text file (unless we later see
48 binary characters). */
49 else if (*bp
== '\r' && buflen
&& bp
[1] == '\n')
55 return crlf_seen
? DOS_TEXT
: UNIX_TEXT
;
58 /* Convert external DOS file representation to internal.
59 Return the count of characters left in the buffer.
60 Build table to map character positions when reporting byte counts. */
62 undossify_input (register char *buf
, size_t buflen
)
68 /* New file: forget everything we knew about the character
69 position mapping table and the file type. */
74 dos_file_type
= dos_use_file_type
;
77 /* Guess if this file is binary, unless we already know that. */
78 if (dos_file_type
== UNKNOWN
)
79 dos_file_type
= guess_type(buf
, buflen
);
81 /* If this file is to be treated as DOS Text, strip the CR characters
82 and maybe build the table for character position mapping on output. */
83 if (dos_file_type
== DOS_TEXT
)
97 if (out_byte
&& !dos_report_unix_offset
)
100 while (buflen
&& *buf
== '\r')
106 if (inp_map_idx
>= dos_pos_map_size
- 1)
108 dos_pos_map_size
= inp_map_idx
? inp_map_idx
* 2 : 1000;
110 (struct dos_map
*)xrealloc((char *)dos_pos_map
,
112 sizeof(struct dos_map
));
117 /* Add sentinel entry. */
118 dos_pos_map
[inp_map_idx
].pos
= 0;
119 dos_pos_map
[inp_map_idx
++].add
= 0;
121 /* Initialize first real entry. */
122 dos_pos_map
[inp_map_idx
].add
= 0;
125 /* Store the new entry. If the stripped CR characters
126 precede a Newline (the usual case), pretend that
127 they were found *after* the Newline. This makes
128 displayed byte offsets more reasonable in some
129 cases, and better fits the intuitive notion that
130 the line ends *before* the CR, not *after* it. */
132 dos_pos_map
[inp_map_idx
-1].pos
=
133 (*buf
== '\n' ? destp
+ 1 : destp
) - bufbeg
+ totalcc
;
134 dos_pos_map
[inp_map_idx
].add
= dos_stripped_crs
;
135 dos_pos_map_used
= inp_map_idx
;
137 /* The following will be updated on the next pass. */
138 dos_pos_map
[inp_map_idx
].pos
= destp
- bufbeg
+ totalcc
+ 1;
149 /* Convert internal byte count into external. */
151 dossified_pos (off_t byteno
)
156 if (dos_file_type
!= DOS_TEXT
|| dos_report_unix_offset
)
159 /* Optimization: usually the file will be scanned sequentially.
160 So in most cases, this byte position will be found in the
161 table near the previous one, as recorded in `out_map_idx'. */
162 pos_lo
= dos_pos_map
[out_map_idx
-1].pos
;
163 pos_hi
= dos_pos_map
[out_map_idx
].pos
;
165 /* If the initial guess failed, search up or down, as
166 appropriate, beginning with the previous place. */
167 if (byteno
>= pos_hi
)
170 while (out_map_idx
< dos_pos_map_used
&&
171 byteno
>= dos_pos_map
[out_map_idx
].pos
)
175 else if (byteno
< pos_lo
)
178 while (out_map_idx
> 1 && byteno
< dos_pos_map
[out_map_idx
-1].pos
)
182 return byteno
+ dos_pos_map
[out_map_idx
].add
;