Fix misleadingly indented statements.
[pspp.git] / src / libpspp / line-reader.c
blobcb4bd014b93ef6b90bd4e75d8649a216be009e2b
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include "line-reader.h"
21 #include <assert.h>
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <unistd.h>
28 #include "libpspp/assertion.h"
29 #include "libpspp/encoding-guesser.h"
30 #include "libpspp/i18n.h"
31 #include "libpspp/str.h"
33 #include "gl/minmax.h"
34 #include "gl/xalloc.h"
/* How a line_reader is currently interpreting its input stream. */
enum line_reader_state
  {
    S_UNIBYTE,                  /* Known stream encoding, 1-byte unit. */
    S_MULTIBYTE,                /* Known stream encoding, multibyte unit. */
    S_AUTO                      /* Encoding autodetection in progress. */
  };
43 struct line_reader
45 int fd;
46 enum line_reader_state state;
47 struct encoding_info encoding_info;
49 char *encoding; /* Current encoding. */
50 char *auto_encoding; /* In S_AUTO mode, user-specified encoding. */
52 char *buffer;
53 char *head;
54 size_t length;
56 int error;
57 bool eof;
60 static ssize_t fill_buffer (struct line_reader *);
/* Opens FILENAME for line-by-line reading, interpreting its contents as
   ENCODING.  FLAGS is handed to open() unchanged and must not include
   O_CREAT.

   On success, returns a new line_reader.  On failure, returns NULL with errno
   set to describe the problem; the file descriptor, if one was opened, is
   closed again.

   The accepted forms for ENCODING are listed at the top of
   encoding-guesser.h. */
struct line_reader *
line_reader_for_file (const char *encoding, const char *filename, int flags)
{
  assert (!(flags & O_CREAT));

  int fd = open (filename, flags);
  if (fd < 0)
    return NULL;

  struct line_reader *reader = line_reader_for_fd (encoding, fd);
  if (reader != NULL)
    return reader;

  /* The reader could not be created.  Release the descriptor, but make sure
     the caller still sees the errno that explains the original failure
     rather than anything close() might store there. */
  int saved_errno = errno;
  close (fd);
  errno = saved_errno;
  return NULL;
}
91 /* Creates and returns a new line_reader that reads its input from FD. Returns
92 a new line_reader if successful, otherwise returns NULL and sets errno to an
93 appropriate value.
95 The accepted forms for ENCODING are listed at the top of
96 encoding-guesser.h. */
97 struct line_reader *
98 line_reader_for_fd (const char *encoding, int fd)
100 struct line_reader *r;
102 r = calloc (1, sizeof *r);
103 if (r == NULL)
104 return NULL;
106 r->fd = fd;
107 r->buffer = malloc (LINE_READER_BUFFER_SIZE);
108 if (r->buffer == NULL)
109 goto error;
110 r->head = r->buffer;
111 r->length = 0;
113 if (fill_buffer (r) < 0)
114 goto error;
116 r->encoding = xstrdup (encoding_guess_head_encoding (
117 encoding, r->buffer, r->length));
118 if (!get_encoding_info (&r->encoding_info, r->encoding))
120 errno = EINVAL;
121 goto error;
124 if (encoding_guess_encoding_is_auto (encoding)
125 && !strcmp (r->encoding, "ASCII"))
127 r->state = S_AUTO;
128 r->auto_encoding = encoding ? xstrdup (encoding) : NULL;
130 else
131 r->state = r->encoding_info.unit == 1 ? S_UNIBYTE : S_MULTIBYTE;
133 return r;
135 error:
136 line_reader_free (r);
137 return NULL;
140 /* Closes R and its underlying file descriptor and frees all associated
141 resources. Returns the return value from close(). */
143 line_reader_close (struct line_reader *r)
145 if (r != NULL)
147 int fd = r->fd;
148 line_reader_free (r);
149 return close (fd);
151 return 0;
154 /* Frees R and associated resources, but does not close the underlying file
155 descriptor. (Thus, the client must close the file descriptor when it is no
156 longer needed.) */
157 void
158 line_reader_free (struct line_reader *r)
160 if (r != NULL)
162 free (r->buffer);
163 free (r->encoding);
164 free (r->auto_encoding);
165 free (r);
169 static ssize_t
170 fill_buffer (struct line_reader *r)
172 ssize_t n;
174 /* Move any unused bytes to the beginning of the input buffer. */
175 if (r->length > 0 && r->buffer != r->head)
176 memmove (r->buffer, r->head, r->length);
177 r->head = r->buffer;
179 /* Read more input. */
182 n = read (r->fd, r->buffer + r->length,
183 LINE_READER_BUFFER_SIZE - r->length);
185 while (n < 0 && errno == EINTR);
186 if (n > 0)
187 r->length += n;
188 else if (n < 0)
189 r->error = errno;
190 else
191 r->eof = true;
192 return n;
195 static void
196 output_bytes (struct line_reader *r, struct string *s, size_t n)
198 ds_put_substring (s, ss_buffer (r->head, n));
199 r->head += n;
200 r->length -= n;
203 static void
204 output_line (struct line_reader *r, struct string *s, size_t n)
206 int unit = r->encoding_info.unit;
208 output_bytes (r, s, n);
210 r->head += unit;
211 r->length -= unit;
213 ds_chomp (s, ss_buffer (r->encoding_info.cr, unit));
216 /* Reads a line of text, but no more than MAX_LENGTH bytes, from R and appends
217 it to S, omitting the final new-line and the carriage return that
218 immediately precedes it, if one is present. The line is left in its
219 original encoding.
221 Returns true if anything was successfully read from the file. (If an empty
222 line was read, then nothing is appended to S.) Returns false if end of file
223 was reached or a read error occurred before any text could be read. */
224 bool
225 line_reader_read (struct line_reader *r, struct string *s, size_t max_length)
227 size_t original_length = ds_length (s);
228 int unit = r->encoding_info.unit;
232 size_t max_out = max_length - (ds_length (s) - original_length);
233 size_t max_in = r->length;
234 size_t max = MIN (max_in, max_out);
235 size_t n;
236 char *p;
238 if (max_out < unit)
239 break;
241 switch (r->state)
243 case S_UNIBYTE:
244 p = memchr (r->head, r->encoding_info.lf[0], max);
245 if (p != NULL)
247 output_line (r, s, p - r->head);
248 return true;
250 n = max;
251 break;
253 case S_MULTIBYTE:
254 for (n = 0; n + unit <= max; n += unit)
255 if (!memcmp (r->head + n, r->encoding_info.lf, unit))
257 output_line (r, s, n);
258 return true;
260 break;
262 case S_AUTO:
263 for (n = 0; n < max; n++)
264 if (!encoding_guess_is_ascii_text (r->head[n]))
266 char *encoding;
268 output_bytes (r, s, n);
269 fill_buffer (r);
270 r->state = S_UNIBYTE;
272 encoding = xstrdup (encoding_guess_tail_encoding (
273 r->auto_encoding, r->head, r->length));
274 free (r->encoding);
275 r->encoding = encoding;
277 free (r->auto_encoding);
278 r->auto_encoding = NULL;
280 n = 0;
281 break;
283 else if (r->head[n] == '\n')
285 output_line (r, s, n);
286 return true;
288 break;
290 default:
291 NOT_REACHED ();
294 output_bytes (r, s, n);
296 while (r->length >= unit || fill_buffer (r) > 0);
298 return ds_length (s) > original_length;
301 /* Returns the file descriptor underlying R. */
303 line_reader_fileno (const struct line_reader *r)
305 return r->fd;
308 /* Returns the offset in the file of the next byte to be read from R, or -1 on
309 error (e.g. if the file is not seekable). */
310 off_t
311 line_reader_tell (const struct line_reader *r)
313 off_t pos = lseek (r->fd, 0, SEEK_CUR);
314 return (pos < 0 ? pos
315 : pos >= r->length ? pos - r->length
316 : 0);
319 /* Returns true if end of file has been encountered reading R. */
320 bool
321 line_reader_eof (const struct line_reader *r)
323 return r->eof && !r->length;
326 /* Returns an nonzero errno value if an error has been encountered reading
327 R, zero otherwise. */
329 line_reader_error (const struct line_reader *r)
331 return !r->length ? r->error : 0;
334 /* Returns the encoding of R. If line_reader_is_auto(R) returns true, the
335 encoding might change as more lines are read. */
336 const char *
337 line_reader_get_encoding (const struct line_reader *r)
339 return r->encoding;
342 /* Returns true if the encoding of the file being read by R is not yet
343 completely known. If this function returns true, then the encoding returned
344 by line_reader_get_encoding() might change as more lines are read (and after
345 the change, this function will return false). */
346 bool
347 line_reader_is_auto (const struct line_reader *r)
349 return r->state == S_AUTO;