1 # CSV -- module for generating/parsing CSV data.
2 # Copyright (C) 2000-2004 NAKAMURA, Hiroshi <nakahiro@sarion.co.jp>.
4 # $Id: csv.rb 11708 2007-02-12 23:01:19Z shyouhei $
6 # This program is copyrighted free software by NAKAMURA, Hiroshi. You can
7 # redistribute it and/or modify it under the same terms of Ruby's license;
8 # either the dual license version in 2003, or any later version.
# Error raised by the parser and the readers when CSV input is malformed.
class IllegalFormatError < RuntimeError
end
# Build a cell from +data+. When +is_null+ is true the value handed to the
# superclass initializer is the empty string, whatever +data+ contains.
def initialize(data = "", is_null = false)
  if is_null
    super("")
  else
    super(data)
  end
end
29 # Open a CSV formatted file for reading or writing.
34 # CSV.open('csvfile.csv', 'r') do |row|
39 # reader = CSV.open('csvfile.csv', 'r')
48 # filename: filename to parse.
49 # col_sep: Column separator. ?, by default. If you want to separate
50 # fields with semicolon, give ?; here.
51 # row_sep: Row separator. nil by default. nil means "\r\n or \n". If you
52 # want to separate records with \r, give ?\r here.
55 # reader instance. To get parse result, see CSV::Reader#each.
61 # CSV.open('csvfile.csv', 'w') do |writer|
62 # writer << ['r1c1', 'r1c2']
63 # writer << ['r2c1', 'r2c2']
64 # writer << [nil, nil]
68 # writer = CSV.open('csvfile.csv', 'w')
69 # writer << ['r1c1', 'r1c2'] << ['r2c1', 'r2c2'] << [nil, nil]
73 # filename: filename to generate.
74 # col_sep: Column separator. ?, by default. If you want to separate
75 # fields with semicolon, give ?; here.
76 # row_sep: Row separator. nil by default. nil means "\r\n or \n". If you
77 # want to separate records with \r, give ?\r here.
80 # writer instance. See CSV::Writer#<< and CSV::Writer#add_row to know how
81 # to generate CSV string.
# Dispatch on +mode+: 'r'/'rb' go to the reader opener, 'w'/'wb' to the writer
# opener, and any other value raises ArgumentError. +fs+ (column separator),
# +rs+ (row separator) and the optional block are forwarded unchanged.
def CSV.open(path, mode, fs = nil, rs = nil, &block)
  case mode
  when 'r', 'rb'
    open_reader(path, mode, fs, rs, &block)
  when 'w', 'wb'
    open_writer(path, mode, fs, rs, &block)
  else
    raise ArgumentError.new("'mode' must be 'r', 'rb', 'w', or 'wb'")
  end
end
# Open the CSV file at +path+ for reading with a comma column separator and
# pass +block+ on to the reader opener. +rs+ is the row separator.
def CSV.foreach(path, rs = nil, &block)
  comma = ','
  open_reader(path, 'r', comma, rs, &block)
end
# Read the file at +path+ with IO.read (+length+ and +offset+ are passed
# straight through) and return whatever CSV.parse makes of the text.
def CSV.read(path, length = nil, offset = nil)
  text = IO.read(path, length, offset)
  CSV.parse(text)
end
# Read the rows of the CSV file at +path+ and collect them into an Array.
# The column separator is fixed to ','; +rs+ is the row separator.
101 def CSV.readlines(path, rs = nil)
102 reader = open_reader(path, 'r', ',', rs)
# collect with an identity block gathers every row the reader yields.
104 reader.collect { |row| row }
# Open +path+ in write mode through the writer opener, forwarding the column
# separator +fs+, the row separator +rs+ and the optional block.
def CSV.generate(path, fs = nil, rs = nil, &block)
  write_mode = 'w'
  open_writer(path, write_mode, fs, rs, &block)
end
114 # Parse lines from given string or stream. Return rows as an Array of Arrays.
# NOTE(review): because of the File.exist? check below, an argument that
# happens to name an existing file is opened as a file instead of being parsed
# as CSV text.
115 def CSV.parse(str_or_readable, fs = nil, rs = nil, &block)
# Deprecated compatibility path: warn on STDERR, then delegate to open_reader.
116 if File.exist?(str_or_readable)
117 STDERR.puts("CSV.parse(filename) is deprecated." +
118 " Use CSV.open(filename, 'r') instead.")
119 return open_reader(str_or_readable, 'r', fs, rs, &block)
# Streaming form: CSV::Reader.parse hands each parsed row to its block.
122 CSV::Reader.parse(str_or_readable, fs, rs) do |row|
# Collecting form: build a reader and gather every row into an Array.
127 CSV::Reader.create(str_or_readable, fs, rs).collect { |row| row }
131 # Parse a line from given string. Bear in mind it parses ONE LINE. The rest
132 # of the string is ignored; for example "a,b\r\nc,d" => ['a', 'b'] and the
133 # second line 'c,d' is ignored.
135 # If you don't know whether a target string to parse is exactly 1 line or
136 # not, use CSV.parse_row instead of this method.
# src: CSV text to parse. fs / rs: column / row separators, forwarded to
# parse_body.
137 def CSV.parse_line(src, fs = nil, rs = nil)
# A row separator given as a character code (Fixnum) is special-cased here.
# NOTE(review): Fixnum was unified into Integer in Ruby 2.4 and later removed;
# this check needs Integer on current Ruby.
142 if !rs.nil? and rs.is_a?(Fixnum)
# Seed the loop state so that the first cell is read.
146 res_type = :DT_COLSEP
# Keep reading cells for as long as parse_body reports a column separator.
149 while res_type == :DT_COLSEP
150 res_type, idx, cell = parse_body(src, idx, fs, rs)
# Malformed input signalled by parse_body is intercepted here.
153 rescue IllegalFormatError
159 # Create a line from cells. Each cell is stringified by to_s.
# row: the cells of the line. fs / rs: column / row separators, forwarded to
# generate_body and generate_separator.
160 def CSV.generate_line(row, fs = nil, rs = nil)
# A row separator given as a character code (Fixnum) is special-cased here.
168 if !rs.nil? and rs.is_a?(Fixnum)
171 res_type = :DT_COLSEP
# Each cell is appended to result_str by generate_body ...
175 generate_body(row[idx], result_str, fs, rs)
# ... and a column separator is appended by generate_separator.
180 generate_separator(:DT_COLSEP, result_str, fs, rs)
185 # Parse a line from string. Consider using CSV.parse_line instead.
186 # To parse lines in CSV string, see EXAMPLE below.
189 # src = "a,b\r\nc,d\r\ne,f"
193 # parsed_cells, idx = CSV.parse_row(src, idx, parsed)
194 # puts "Parsed #{ parsed_cells } cells."
196 # end while parsed_cells > 0
199 # src: CSV data to be parsed. Must respond to '[](idx)'.
200 # src[](idx) must return a char. (Not a string such as 'a', but 97).
201 # src[](idx_out_of_bounds) must return nil. A String satisfies this
203 # idx: index of parsing location of 'src'. 0 origin.
204 # out_dev: buffer for parsed cells. Must respond to '<<(aString)'.
205 # col_sep: Column separator. ?, by default. If you want to separate
206 # fields with semicolon, give ?; here.
207 # row_sep: Row separator. nil by default. nil means "\r\n or \n". If you
208 # want to separate records with \r, give ?\r here.
211 # parsed_cells: num of parsed cells.
212 # idx: index of next parsing location of 'src'.
214 def CSV.parse_row(src, idx, out_dev, fs = nil, rs = nil)
# A row separator given as a character code (Fixnum) is special-cased here.
219 if !rs.nil? and rs.is_a?(Fixnum)
224 res_type = :DT_COLSEP
# Read cells until parse_body reports a row separator.
226 while res_type != :DT_ROWSEP
227 res_type, idx, cell = parse_body(src, idx, fs, rs)
# End of input: idx == idx_backup means parse_body made no progress at all.
228 if res_type == :DT_EOS
229 if idx == idx_backup #((parsed_cells == 0) and cell.nil?)
# Treat end of input as the end of the row so that the loop terminates.
232 res_type = :DT_ROWSEP
# Malformed input signalled by parse_body is intercepted here.
237 rescue IllegalFormatError
# Two return values: the number of cells parsed and the next index into src.
240 return parsed_cells, idx
243 # Convert a line from cells data to string. Consider using CSV.generate_line
244 # instead. To generate multi-row CSV string, see EXAMPLE below.
250 # src = [row1, row2, row3]
253 # parsed_cells = CSV.generate_row(row, 2, buf)
254 # puts "Created #{ parsed_cells } cells."
259 # src: an Array of String to be converted to CSV string. Must respond to
260 # 'size' and '[](idx)'. src[idx] must return String.
261 # cells: num of cells in a line.
262 # out_dev: buffer for generated CSV string. Must respond to '<<(string)'.
263 # col_sep: Column separator. ?, by default. If you want to separate
264 # fields with semicolon, give ?; here.
265 # row_sep: Row separator. nil by default. nil means "\r\n or \n". If you
266 # want to separate records with \r, give ?\r here.
269 # parsed_cells: num of converted cells.
271 def CSV.generate_row(src, cells, out_dev, fs = nil, rs = nil)
# A row separator given as a character code (Fixnum) is special-cased here.
276 if !rs.nil? and rs.is_a?(Fixnum)
282 generate_separator(:DT_ROWSEP, out_dev, fs, rs)
286 res_type = :DT_COLSEP
# First cell is written without a leading separator.
288 generate_body(src[parsed_cells], out_dev, fs, rs)
# Remaining cells: stop at the requested count or when src is exhausted,
# writing a column separator before each cell.
290 while ((parsed_cells < cells) and (parsed_cells != src_size))
291 generate_separator(:DT_COLSEP, out_dev, fs, rs)
292 generate_body(src[parsed_cells], out_dev, fs, rs)
# A full row (all requested cells written) is closed with a row separator;
# the other visible call emits a column separator instead.
295 if (parsed_cells == cells)
296 generate_separator(:DT_ROWSEP, out_dev, fs, rs)
298 generate_separator(:DT_COLSEP, out_dev, fs, rs)
303 # Private class methods.
# Open +path+ with +mode+ and read CSV from it: rows are either streamed
# through CSV::Reader.parse or served by a reader built with
# CSV::Reader.create. +fs+ / +rs+ are the column / row separators.
307 def open_reader(path, mode, fs, rs, &block)
308 file = File.open(path, mode)
311 CSV::Reader.parse(file, fs, rs) do |row|
319 reader = CSV::Reader.create(file, fs, rs)
# Ask the reader to close the underlying file when the reader is closed.
320 reader.close_on_terminate
# Open +path+ with +mode+ and write CSV to it: either through the block form
# CSV::Writer.generate or through a writer built with CSV::Writer.create.
# +fs+ / +rs+ are the column / row separators.
325 def open_writer(path, mode, fs, rs, &block)
326 file = File.open(path, mode)
329 CSV::Writer.generate(file, fs, rs) do |writer|
337 writer = CSV::Writer.create(file, fs, rs)
# Ask the writer to close the underlying file when the writer is closed.
338 writer.close_on_terminate
# Scan one cell of +src+ starting at +idx+, driven by a small state machine
# (:ST_START, :ST_DATA, :ST_QUOTE are the states visible here).
# Returns three values: a separator type (callers compare it with
# :DT_COLSEP, :DT_ROWSEP and :DT_EOS), the next index into +src+, and the cell
# (nil in some cases). Raises IllegalFormatError on malformed input.
343 def parse_body(src, idx, fs, rs)
345 fs_size = fs_str.size
347 rs_size = rs_str.size
# Does the current character continue a match of the column / row separator
# at the current match position?
356 fschar = (c == fs_str[fs_idx])
357 rschar = (c == rs_str[rs_idx])
358 # simple 1 char backtrack: c broke the match but equals the first char
359 if !fschar and c == fs_str[0]
362 if state == :ST_START
364 elsif state == :ST_QUOTE
365 raise IllegalFormatError
# Same one-character backtrack for the row separator.
368 if !rschar and c == rs_str[0]
371 if state == :ST_START
373 elsif state == :ST_QUOTE
374 raise IllegalFormatError
381 raise IllegalFormatError
# Flush the pending span src[last_idx...idx] into the cell.
383 cell << src[last_idx, (idx - last_idx)]
391 raise IllegalFormatError
393 elsif state == :ST_QUOTE
403 elsif fschar or rschar
412 if state == :ST_START and rs_idx > 0 and fs_idx < rs_idx
# Flush pending data, leaving out the (fs_size - 1) separator characters
# that were consumed before idx.
415 cell << src[last_idx, (idx - last_idx - (fs_size - 1))]
419 raise IllegalFormatError
# The whole row separator has been matched.
422 elsif rs_idx == rs_size
423 if state == :ST_START and fs_idx > 0 and rs_idx < fs_idx
# Flush pending data, leaving out the (rs_size - 1) separator characters
# that were consumed before idx.
427 cell << src[last_idx, (idx - last_idx - (rs_size - 1))]
# Separator found: report its type, the index just past it, and the cell.
435 return sep, idx + 1, cell;
436 elsif state == :ST_QUOTE
437 return sep, idx + 1, cell;
# Remaining case: the cell is reported as nil.
439 return sep, idx + 1, nil
442 elsif rs.nil? and c == ?\r
443 # special \r treatment for backward compatibility
446 raise IllegalFormatError
448 cell << src[last_idx, (idx - last_idx)]
457 if state == :ST_DATA or state == :ST_START
459 raise IllegalFormatError
463 raise IllegalFormatError
# Finalization: the returns below report :DT_EOS with the index unchanged.
468 if state == :ST_START
469 if fs_idx > 0 or rs_idx > 0
472 return :DT_EOS, idx, nil
475 raise IllegalFormatError
477 raise IllegalFormatError
479 cell << src[last_idx, (idx - last_idx)]
481 return :DT_EOS, idx, cell
# Append one cell to +out_dev+ in CSV form.
# The visible conditions put the cell in double quotes when it contained a
# double quote (gsub! doubles each one and returns non-nil only if it changed
# something), contains the column separator +fs+, contains the row separator
# +rs+ (when one is given), or contains CR/LF.
484 def generate_body(cell, out_dev, fs, rs)
490 if (row_data.gsub!('"', '""') or
491 row_data.index(fs) or
492 (rs and row_data.index(rs)) or
493 (/[\r\n]/ =~ row_data) or
495 out_dev << '"' << row_data << '"'
# Append a separator of the given +type+ to +out_dev+ (callers pass
# :DT_COLSEP or :DT_ROWSEP).
502 def generate_separator(type, out_dev, fs, rs)
# Row separator: +rs+ when given, otherwise "\n".
507 out_dev << (rs || "\n")
513 # CSV formatted string/stream reader.
516 # read CSV lines until the first column is 'stop'.
518 # CSV::Reader.parse(File.open('bigdata', 'rb')) do |row|
520 # break if !row[0].is_null && row[0].data == 'stop'
526 # Parse CSV data and get lines. Given block is called for each parsed row.
527 # Block value is always nil. Rows are not cached for performance reasons.
# fs: column separator (',' by default). rs: row separator (nil by default).
528 def Reader.parse(str_or_readable, fs = ',', rs = nil, &block)
# The concrete reader is chosen by Reader.create.
529 reader = Reader.create(str_or_readable, fs, rs)
541 # Returns reader instance.
# The instance is an IOReader or a StringReader built over str_or_readable
# with the given column (+fs+) and row (+rs+) separators.
# NOTE(review): the conditions selecting between the three constructor calls
# below are not visible here; confirm against the full method.
542 def Reader.create(str_or_readable, fs = ',', rs = nil)
545 IOReader.new(str_or_readable, fs, rs)
547 StringReader.new(str_or_readable, fs, rs)
549 IOReader.new(str_or_readable, fs, rs)
# Row retrieval goes through get_row, which fills +row+ and returns the number
# of parsed cells.
556 parsed_cells = get_row(row)
567 parsed_cells = get_row(row)
# Guard against direct instantiation of this class.
578 raise RuntimeError.new('Do not instanciate this class directly.')
# get_row is the hook that derived reader classes have to provide.
582 raise NotImplementedError.new('Method get_row must be defined in a derived class.')
# Reader over an in-memory String.
591 class StringReader < Reader
# string: CSV text. fs: column separator. rs: row separator.
592 def initialize(string, fs = ',', rs = nil)
# "\xef\xbb\xbf" is the UTF-8 byte order mark; its presence at the head of the
# data is checked here.
597 if @dev[0, 3] == "\xef\xbb\xbf"
# Parse the next row starting at @idx; cells are appended to +row+.
605 parsed_cells, next_idx = CSV.parse_row(@dev, @idx, row, @fs, @rs)
# Nothing parsed and no progress although input remains: malformed data.
606 if parsed_cells == 0 and next_idx == 0 and @idx != @dev.size
607 raise IllegalFormatError.new
# Reader over an IO, buffered through CSV::IOBuf.
615 class IOReader < Reader
# io: the stream to read. fs: column separator. rs: row separator.
616 def initialize(io, fs = ',', rs = nil)
620 @dev = CSV::IOBuf.new(@io)
# 0xef 0xbb 0xbf are the bytes of a UTF-8 byte order mark; their presence at
# the head of the stream is checked here.
622 if @dev[0] == 0xef and @dev[1] == 0xbb and @dev[2] == 0xbf
# By default the IO is left open; see close_on_terminate.
625 @close_on_terminate = false
628 # Tell this reader to close the IO when terminated (Triggered by invoking
629 # CSV::IOReader#close).
630 def close_on_terminate
631 @close_on_terminate = true
# Parse the next row starting at @idx; cells are appended to +row+.
637 parsed_cells, next_idx = CSV.parse_row(@dev, @idx, row, @fs, @rs)
# Nothing parsed and no progress although the buffer is not at EOS:
# malformed data.
638 if parsed_cells == 0 and next_idx == 0 and !@dev.is_eos?
639 raise IllegalFormatError.new
# Release the consumed prefix from the buffer and rebase @idx by the number of
# characters actually dropped.
641 dropped = @dev.drop(next_idx)
642 @idx = next_idx - dropped
# Only act on the IO if close_on_terminate was requested.
647 if @close_on_terminate
658 # CSV formatted string/stream writer.
661 # Write rows to 'csvout' file.
663 # outfile = File.open('csvout', 'wb')
664 # CSV::Writer.generate(outfile) do |csv|
665 # csv << ['c1', nil, '', '"', "\r\n", 'c2']
672 # Given block is called with the writer instance. str_or_writable must
673 # handle '<<(string)'.
# fs: column separator (',' by default). rs: row separator (nil by default).
674 def Writer.generate(str_or_writable, fs = ',', rs = nil, &block)
# The concrete writer is chosen by Writer.create.
675 writer = Writer.create(str_or_writable, fs, rs)
# Build and return a writer (a BasicWriter) over +str_or_writable+, which must
# handle '<<(string)'. +fs+ is the column separator, +rs+ the row separator.
def Writer.create(str_or_writable, fs = ',', rs = nil)
  writer = BasicWriter.new(str_or_writable, fs, rs)
  writer
end
690 # dump CSV stream to the device. argument must be an Array of String.
# All row.size cells of the row are written to @dev with the writer's
# separators.
692 CSV.generate_row(row, row.size, @dev, @fs, @rs)
# Guard against direct instantiation of this class.
704 raise RuntimeError.new('Do not instanciate this class directly.')
# Writer that appends generated CSV to any device handling '<<(string)'.
713 class BasicWriter < Writer
# str_or_writable: output device. fs: column separator. rs: row separator.
714 def initialize(str_or_writable, fs = ',', rs = nil)
717 @dev = str_or_writable
# By default the device is left open; see close_on_terminate.
718 @close_on_terminate = false
721 # Tell this writer to close the IO when terminated (Triggered by invoking
722 # CSV::BasicWriter#close).
723 def close_on_terminate
724 @close_on_terminate = true
# Only act on the device if close_on_terminate was requested.
730 if @close_on_terminate
740 # EXAMPLE 1 -- an IO.
741 # class MyBuf < StreamBuf
742 # # Do initialize myself before a super class. Super class might call my
743 # # method 'read'. (Could be awful for C++ user. :-)
749 # # define my own 'read' method.
750 # # CAUTION: Returning nil means EndOfStream.
755 # # release buffers. in Ruby which has GC, you do not have to call this...
762 # buf = MyBuf.new(STDIN)
764 # p buf[0, 0] # => '' (null string)
765 # p buf[0] # => 97 (char code of 'a')
766 # p buf[0, 1] # => 'a'
768 # p my_str # => 'abcde' (5 chars)
769 # p buf[0, 6] # => "abcde\n" (6 chars)
770 # p buf[0, 7] # => "abcde\n" (6 chars)
771 # p buf.drop(3) # => 3 (dropped chars)
772 # p buf.get(0, 2) # => 'de' (2 chars)
773 # p buf.is_eos? # => false (is not EOS here)
774 # p buf.drop(5) # => 3 (dropped chars)
775 # p buf.is_eos? # => true (is EOS here)
776 # p buf[0] # => nil (is EOS here)
778 # EXAMPLE 2 -- String.
779 # This is a conceptual example. No pros with this.
781 # class StrBuf < StreamBuf
789 # str = @str[@idx, size]
796 # get a char or a partial string from the stream.
797 # idx: index of a string to specify a start point of a string to get.
798 # unlike String instance, idx < 0 returns nil.
799 # n: size of a string to get.
800 # returns char at idx if n == nil.
801 # returns a partial string, from idx to (idx + n) if n != nil. at EOF,
802 # the string size might not equal arg n.
# Requests starting at or beyond end of stream are handled first.
807 if (idx_is_eos?(idx))
808 if n and (@offset + idx == buf_size(@cur_buf))
809 # Like a String, 'abc'[4, 1] returns nil and
810 # 'abc'[3, 1] returns '' not nil.
# Walk forward through the buffer list until the one containing the requested
# position is found, reducing next_idx by each skipped buffer's size.
819 while (my_offset + next_idx >= buf_size(my_buf))
820 if (my_buf == @buf_tail_idx)
825 next_idx = my_offset + next_idx - buf_size(my_buf)
# loc is the position inside the selected buffer.
829 loc = my_offset + next_idx
831 return @buf_list[my_buf][loc] # Fixnum of char code.
# The requested span lies entirely inside this buffer.
832 elsif (loc + n - 1 < buf_size(my_buf))
833 return @buf_list[my_buf][loc, n] # String.
834 else # should do loop instead of (tail) recursive call...
# Take what this buffer holds from loc onwards; size_added is that length.
835 res = @buf_list[my_buf][loc, BufSize]
836 size_added = buf_size(my_buf) - loc
850 # drop a string from the stream.
851 # returns dropped size. at EOF, dropped size might not equal arg n.
852 # Once you drop the head of the stream, access to the dropped part via []
853 # or get returns nil.
# There is still something to drop unless the stream is at EOS and the current
# buffer is the last one.
860 if !@is_eos or (@cur_buf != @buf_tail_idx)
# The drop ends inside the current buffer.
861 if (@offset + n < buf_size(@cur_buf))
# Otherwise size is what remains of the current buffer past @offset.
866 size = buf_size(@cur_buf) - @offset
874 @cur_buf = @buf_tail_idx
# The stream is at EOS when position 0 of the remaining data is at EOS.
883 return idx_is_eos?(0)
886 # WARN: Do not instantiate this class directly. Define your own class
887 # which derives from this class and defines a 'read' instance method.
# -1 for both indices means that no buffer has been read yet.
# NOTE(review): inferred from the (@cur_buf < 0) test in the EOS check; confirm.
890 @cur_buf = @buf_tail_idx = -1
894 @cur_buf = @buf_tail_idx
# Protected hook 'read': derived classes must define it.
# CAUTION: returning a string whose size is not equal to 'size' means
# EndOfStream. While the stream is not at EOS, the implementation must block
# the caller until it can return a string of exactly the requested size.
def read(size) # raise EOFError
  raise NotImplementedError, 'Method read must be defined in a derived class.'
end
# Fetch the next chunk, at most BufSize characters, through the 'read' hook.
922 str_read = read(BufSize)
# The chunk is appended to the list of buffers.
935 @buf_list.push(str_read)
# Clear the slot of the current buffer so that the list no longer references
# its string.
945 @buf_list[@cur_buf] = nil
946 if (@cur_buf == @buf_tail_idx)
# EOS condition: the source has reported EOS and either nothing was ever
# buffered (@cur_buf < 0) or the current buffer is the last one.
956 (@is_eos and ((@cur_buf < 0) or (@cur_buf == @buf_tail_idx)))
965 # # File 'bigdata' could be a giga-byte size one!
966 # buf = CSV::IOBuf.new(File.open('bigdata', 'rb'))
967 # CSV::Reader.new(buf).each do |row|
969 # break if row[0].data == 'admin'
972 class IOBuf < StreamBuf