first version upgrade
[devspec.git] / devspec.en_US / project / recutils / libcsv / libcsv.c
blob7ba6403d407b7a205d4ed2c98a88cbbd8d82c2f8
1 /*
2 * libcsv - parse and write csv data
4 * Original Author: Robert Gamble.
6 * Copyright (C) 2008 Robert Gamble
7 * Copyright (C) 2010-2015 Jose E. Marchesi
8 */
10 /* This program is free software: you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License version
12 * 3 as published by the Free Software Foundation.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program. If not, see <http://www.gnu.org/licenses/>.
/* Take SIZE_MAX from <stdint.h> on C99 and later.  The hand-written
 * fallback is used only when no SIZE_MAX has been provided, so it can
 * never clash with the definition from the system header. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
# include <stdint.h>
#endif
#ifndef SIZE_MAX
# define SIZE_MAX ((size_t)-1) /* C89 doesn't have stdint.h or SIZE_MAX */
#endif
29 #include <config.h>
31 #include "csv.h"
#define LIBCSV_VERSION "3.0.0-recutils"

/*
 * Parser states.
 *
 *   ROW_NOT_BEGUN           There have not been any fields encountered
 *                           for this row
 *   FIELD_NOT_BEGUN         There have been fields but we are currently
 *                           not in one
 *   FIELD_BEGUN             We are in a field
 *   FIELD_MIGHT_HAVE_ENDED  We encountered a double quote inside a quoted
 *                           field, the field is either ended or the quote
 *                           is literal
 */
enum {
  ROW_NOT_BEGUN          = 0,
  FIELD_NOT_BEGUN        = 1,
  FIELD_BEGUN            = 2,
  FIELD_MIGHT_HAVE_ENDED = 3
};

/* Initial value of a parser's blk_size, the step by which the entry
 * buffer is grown */
#define MEM_BLK_SIZE 128
/*
 * SUBMIT_FIELD(p): finish the field currently held in (p)->entry_buf.
 *
 * Works on the enclosing function's locals `quoted`, `spaces`,
 * `entry_pos`, `pstate`, `cb1` and `data`:
 *   - for an unquoted field the trailing `spaces` bytes are dropped;
 *   - with CSV_APPEND_NULL set, a terminating '\0' is stored directly
 *     after the last byte of the field, i.e. at index entry_pos (not one
 *     byte further, which would leave a stale byte inside the string);
 *   - cb1, when non-NULL, receives the buffer, the field length and data;
 *   - the state becomes FIELD_NOT_BEGUN and the per-field counters are
 *     reset.
 */
#define SUBMIT_FIELD(p) \
  do { \
    if (!quoted) \
      entry_pos -= spaces; \
    if ((p)->options & CSV_APPEND_NULL) \
      ((p)->entry_buf[entry_pos]) = '\0'; \
    if (cb1) \
      cb1((p)->entry_buf, entry_pos, data); \
    pstate = FIELD_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)
/*
 * SUBMIT_ROW(p, c): report the end of a row.
 *
 * Calls cb2 (when non-NULL) with the terminating character c and the
 * user data, then puts the enclosing function's locals back into the
 * ROW_NOT_BEGUN state with all per-field counters cleared.
 */
#define SUBMIT_ROW(p, c) \
  do { \
    if (cb2) \
      cb2((c), data); \
    pstate = ROW_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)
/* SUBMIT_CHAR(p, c): append c to (p)->entry_buf at the enclosing
 * function's local index entry_pos, then advance that index. */
#define SUBMIT_CHAR(p, c) \
  ((p)->entry_buf[entry_pos++] = (c))
/* Human-readable messages, indexed by parser status code */
static char *csv_errors[] = {
  "success",
  "error parsing data while strict checking enabled",
  "memory exhausted while increasing buffer size",
  "data size too large",
  "invalid status code"
};
80 int
81 csv_error(struct csv_parser *p)
83 /* Return the current status of the parser */
84 return p->status;
87 char *
88 csv_strerror(int status)
90 /* Return a textual description of status */
91 if (status >= CSV_EINVALID || status < 0)
92 return csv_errors[CSV_EINVALID];
93 else
94 return csv_errors[status];
97 int
98 csv_get_opts(struct csv_parser *p)
100 /* Return the currently set options of parser */
101 if (p == NULL)
102 return -1;
104 return p->options;
108 csv_set_opts(struct csv_parser *p, unsigned char options)
110 /* Set the options */
111 if (p == NULL)
112 return -1;
114 p->options = options;
115 return 0;
119 csv_init(struct csv_parser *p, unsigned char options)
121 /* Initialize a csv_parser object returns 0 on success, -1 on error */
122 if (p == NULL)
123 return -1;
125 p->entry_buf = NULL;
126 p->pstate = ROW_NOT_BEGUN;
127 p->quoted = 0;
128 p->spaces = 0;
129 p->entry_pos = 0;
130 p->entry_size = 0;
131 p->status = 0;
132 p->options = options;
133 p->quote_char = CSV_QUOTE;
134 p->delim_char = CSV_COMMA;
135 p->is_space = NULL;
136 p->is_term = NULL;
137 p->blk_size = MEM_BLK_SIZE;
138 p->malloc_func = NULL;
139 p->realloc_func = realloc;
140 p->free_func = free;
142 return 0;
145 void
146 csv_free(struct csv_parser *p)
148 /* Free the entry_buffer of csv_parser object */
149 if (p == NULL)
150 return;
152 if (p->entry_buf)
153 p->free_func(p->entry_buf);
155 p->entry_buf = NULL;
156 p->entry_size = 0;
158 return;
162 csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
164 /* Finalize parsing. Needed, for example, when file does not end in a newline */
165 int quoted = p->quoted;
166 int pstate = p->pstate;
167 size_t spaces = p->spaces;
168 size_t entry_pos = p->entry_pos;
170 if (p == NULL)
171 return -1;
174 if (p->pstate == FIELD_BEGUN && p->quoted && p->options & CSV_STRICT && p->options & CSV_STRICT_FINI) {
175 /* Current field is quoted, no end-quote was seen, and CSV_STRICT_FINI is set */
176 p->status = CSV_EPARSE;
177 return -1;
180 switch (p->pstate) {
181 case FIELD_MIGHT_HAVE_ENDED:
182 p->entry_pos -= p->spaces + 1; /* get rid of spaces and original quote */
183 /* Fall-through */
184 case FIELD_NOT_BEGUN:
185 case FIELD_BEGUN:
186 quoted = p->quoted, pstate = p->pstate;
187 spaces = p->spaces, entry_pos = p->entry_pos;
188 SUBMIT_FIELD(p);
189 SUBMIT_ROW(p, -1);
190 case ROW_NOT_BEGUN: /* Already ended properly */
194 /* Reset parser */
195 p->spaces = p->quoted = p->entry_pos = p->status = 0;
196 p->pstate = ROW_NOT_BEGUN;
198 return 0;
201 void
202 csv_set_delim(struct csv_parser *p, unsigned char c)
204 /* Set the delimiter */
205 if (p) p->delim_char = c;
208 void
209 csv_set_quote(struct csv_parser *p, unsigned char c)
211 /* Set the quote character */
212 if (p) p->quote_char = c;
215 unsigned char
216 csv_get_delim(struct csv_parser *p)
218 /* Get the delimiter */
219 return p->delim_char;
222 unsigned char
223 csv_get_quote(struct csv_parser *p)
225 /* Get the quote character */
226 return p->quote_char;
229 void
230 csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char))
232 /* Set the space function */
233 if (p) p->is_space = f;
236 void
237 csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char))
239 /* Set the term function */
240 if (p) p->is_term = f;
243 void
244 csv_set_realloc_func(struct csv_parser *p, void *(*f)(void *, size_t))
246 /* Set the realloc function used to increase buffer size */
247 if (p && f) p->realloc_func = f;
250 void
251 csv_set_free_func(struct csv_parser *p, void (*f)(void *))
253 /* Set the free function used to free the buffer */
254 if (p && f) p->free_func = f;
257 void
258 csv_set_blk_size(struct csv_parser *p, size_t size)
260 /* Set the block size used to increment buffer size */
261 if (p) p->blk_size = size;
264 size_t
265 csv_get_buffer_size(struct csv_parser *p)
267 /* Get the size of the entry buffer */
268 if (p)
269 return p->entry_size;
270 return 0;
273 static int
274 csv_increase_buffer(struct csv_parser *p)
276 /* Increase the size of the entry buffer. Attempt to increase size by
277 * p->blk_size, if this is larger than SIZE_MAX try to increase current
278 * buffer size to SIZE_MAX. If allocation fails, try to allocate halve
279 * the size and try again until successful or increment size is zero.
282 size_t to_add = p->blk_size;
283 void *vp;
285 if ( p->entry_size >= SIZE_MAX - to_add )
286 to_add = SIZE_MAX - p->entry_size;
288 if (!to_add) {
289 p->status = CSV_ETOOBIG;
290 return -1;
293 while ((vp = p->realloc_func(p->entry_buf, p->entry_size + to_add)) == NULL) {
294 to_add /= 2;
295 if (!to_add) {
296 p->status = CSV_ENOMEM;
297 return -1;
301 /* Update entry buffer pointer and entry_size if successful */
302 p->entry_buf = vp;
303 p->entry_size += to_add;
304 return 0;
307 size_t
308 csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
310 unsigned const char *us = s; /* Access input data as array of unsigned char */
311 unsigned char c; /* The character we are currently processing */
312 size_t pos = 0; /* The number of characters we have processed in this call */
314 /* Store key fields into local variables for performance */
315 unsigned char delim = p->delim_char;
316 unsigned char quote = p->quote_char;
317 int (*is_space)(unsigned char) = p->is_space;
318 int (*is_term)(unsigned char) = p->is_term;
319 int quoted = p->quoted;
320 int pstate = p->pstate;
321 size_t spaces = p->spaces;
322 size_t entry_pos = p->entry_pos;
325 if (!p->entry_buf && pos < len) {
326 /* Buffer hasn't been allocated yet and len > 0 */
327 if (csv_increase_buffer(p) != 0) {
328 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
329 return pos;
333 while (pos < len) {
334 /* Check memory usage, increase buffer if neccessary */
335 if (entry_pos == ((p->options & CSV_APPEND_NULL) ? p->entry_size - 1 : p->entry_size) ) {
336 if (csv_increase_buffer(p) != 0) {
337 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
338 return pos;
342 c = us[pos++];
344 switch (pstate) {
345 case ROW_NOT_BEGUN:
346 case FIELD_NOT_BEGUN:
347 if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) { /* Space or Tab */
348 continue;
349 } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
350 if (pstate == FIELD_NOT_BEGUN) {
351 SUBMIT_FIELD(p);
352 SUBMIT_ROW(p, (unsigned char)c);
353 } else { /* ROW_NOT_BEGUN */
354 /* Don't submit empty rows by default */
355 if (p->options & CSV_REPALL_NL) {
356 SUBMIT_ROW(p, (unsigned char)c);
359 continue;
360 } else if (c == delim) { /* Comma */
361 SUBMIT_FIELD(p);
362 break;
363 } else if (c == quote) { /* Quote */
364 pstate = FIELD_BEGUN;
365 quoted = 1;
366 } else { /* Anything else */
367 pstate = FIELD_BEGUN;
368 quoted = 0;
369 SUBMIT_CHAR(p, c);
371 break;
372 case FIELD_BEGUN:
373 if (c == quote) { /* Quote */
374 if (quoted) {
375 SUBMIT_CHAR(p, c);
376 pstate = FIELD_MIGHT_HAVE_ENDED;
377 } else {
378 /* STRICT ERROR - double quote inside non-quoted field */
379 if (p->options & CSV_STRICT) {
380 p->status = CSV_EPARSE;
381 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
382 return pos-1;
384 SUBMIT_CHAR(p, c);
385 spaces = 0;
387 } else if (c == delim) { /* Comma */
388 if (quoted) {
389 SUBMIT_CHAR(p, c);
390 } else {
391 SUBMIT_FIELD(p);
393 } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
394 if (!quoted) {
395 SUBMIT_FIELD(p);
396 SUBMIT_ROW(p, (unsigned char)c);
397 } else {
398 SUBMIT_CHAR(p, c);
400 } else if (!quoted && (is_space? is_space(c) : c == CSV_SPACE || c == CSV_TAB)) { /* Tab or space for non-quoted field */
401 SUBMIT_CHAR(p, c);
402 spaces++;
403 } else { /* Anything else */
404 SUBMIT_CHAR(p, c);
405 spaces = 0;
407 break;
408 case FIELD_MIGHT_HAVE_ENDED:
409 /* This only happens when a quote character is encountered in a quoted field */
410 if (c == delim) { /* Comma */
411 entry_pos -= spaces + 1; /* get rid of spaces and original quote */
412 SUBMIT_FIELD(p);
413 } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
414 entry_pos -= spaces + 1; /* get rid of spaces and original quote */
415 SUBMIT_FIELD(p);
416 SUBMIT_ROW(p, (unsigned char)c);
417 } else if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) { /* Space or Tab */
418 SUBMIT_CHAR(p, c);
419 spaces++;
420 } else if (c == quote) { /* Quote */
421 if (spaces) {
422 /* STRICT ERROR - unescaped double quote */
423 if (p->options & CSV_STRICT) {
424 p->status = CSV_EPARSE;
425 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
426 return pos-1;
428 spaces = 0;
429 SUBMIT_CHAR(p, c);
430 } else {
431 /* Two quotes in a row */
432 pstate = FIELD_BEGUN;
434 } else { /* Anything else */
435 /* STRICT ERROR - unescaped double quote */
436 if (p->options & CSV_STRICT) {
437 p->status = CSV_EPARSE;
438 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
439 return pos-1;
441 pstate = FIELD_BEGUN;
442 spaces = 0;
443 SUBMIT_CHAR(p, c);
445 break;
446 default:
447 break;
450 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
451 return pos;
/*
 * Write src_size bytes from src to dest as a double-quoted CSV field,
 * doubling every embedded '"'.  At most dest_size bytes are stored and
 * no terminating NUL is added.
 *
 * Returns the number of bytes the complete field needs (saturating at
 * SIZE_MAX), which may exceed dest_size; a NULL dest only computes that
 * length.  Returns 0 when src is NULL.
 */
size_t
csv_write (void *dest, size_t dest_size, const void *src, size_t src_size)
{
  unsigned char *out = dest;
  const unsigned char *in = src;
  size_t written = 0;
  size_t i;

  if (src == NULL)
    return 0;

  if (out == NULL)
    dest_size = 0;

  /* Opening quote */
  if (dest_size > 0)
    *out++ = '"';
  written++;

  for (i = 0; i < src_size; i++) {
    if (in[i] == '"') {
      /* Escape an embedded quote by emitting it twice */
      if (written < dest_size)
        *out++ = '"';
      if (written < SIZE_MAX)
        written++;
    }
    if (written < dest_size)
      *out++ = in[i];
    if (written < SIZE_MAX)
      written++;
  }

  /* Closing quote */
  if (written < dest_size)
    *out = '"';
  if (written < SIZE_MAX)
    written++;

  return written;
}
/*
 * Write src_size bytes from src to fp as a double-quoted CSV field,
 * doubling every embedded '"'.
 *
 * Returns 0 on success and also when fp or src is NULL (nothing is
 * written in that case); returns EOF as soon as a write fails.
 */
int
csv_fwrite (FILE *fp, const void *src, size_t src_size)
{
  const unsigned char *in = src;
  size_t i;

  if (fp == NULL || src == NULL)
    return 0;

  /* Opening quote */
  if (fputc('"', fp) == EOF)
    return EOF;

  for (i = 0; i < src_size; i++) {
    /* An embedded quote is escaped by writing it twice */
    if (in[i] == '"' && fputc('"', fp) == EOF)
      return EOF;
    if (fputc(in[i], fp) == EOF)
      return EOF;
  }

  /* Closing quote */
  return (fputc('"', fp) == EOF) ? EOF : 0;
}
/*
 * Write src_size bytes from src to dest as a quoted CSV field, using
 * quote as the quote character and doubling every embedded occurrence
 * of it.  At most dest_size bytes are stored and no terminating NUL is
 * added.
 *
 * Returns the number of bytes the complete field needs (saturating at
 * SIZE_MAX), which may exceed dest_size; a NULL dest only computes that
 * length.  Returns 0 when src is NULL.
 */
size_t
csv_write2 (void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote)
{
  unsigned char *out = dest;
  const unsigned char *in = src;
  size_t written = 0;
  size_t i;

  if (src == NULL)
    return 0;

  if (dest == NULL)
    dest_size = 0;

  /* Opening quote */
  if (dest_size > 0)
    *out++ = quote;
  written++;

  for (i = 0; i < src_size; i++) {
    if (in[i] == quote) {
      /* Escape an embedded quote character by emitting it twice */
      if (written < dest_size)
        *out++ = quote;
      if (written < SIZE_MAX)
        written++;
    }
    if (written < dest_size)
      *out++ = in[i];
    if (written < SIZE_MAX)
      written++;
  }

  /* Closing quote */
  if (written < dest_size)
    *out = quote;
  if (written < SIZE_MAX)
    written++;

  return written;
}
/*
 * Write src_size bytes from src to fp as a quoted CSV field, using quote
 * as the quote character and doubling every embedded occurrence of it.
 *
 * Returns 0 on success and also when fp or src is NULL (nothing is
 * written in that case); returns EOF as soon as a write fails.
 */
int
csv_fwrite2 (FILE *fp, const void *src, size_t src_size, unsigned char quote)
{
  const unsigned char *in = src;
  size_t i;

  if (fp == NULL || src == NULL)
    return 0;

  /* Opening quote */
  if (fputc(quote, fp) == EOF)
    return EOF;

  for (i = 0; i < src_size; i++) {
    /* An embedded quote character is escaped by writing it twice */
    if (in[i] == quote && fputc(quote, fp) == EOF)
      return EOF;
    if (fputc(in[i], fp) == EOF)
      return EOF;
  }

  /* Closing quote */
  return (fputc(quote, fp) == EOF) ? EOF : 0;
}
586 /* End of libcsv.c */