new tool
[hband-tools.git] / tabdata / vcf2td
blob8c119ed277cfb7299a500d0fbb61ea398187fd25
1 #!/usr/bin/env perl
3 =pod
5 =head1 NAME
7 vcf2td - Transform VCF to tabular data format.
9 =head1 OPTIONS
11 =over 4
13 =item -c, --column I<COLUMN>
15 Indicate that there will be a column by the name I<COLUMN>.
16 Useful if the first record does not contain all fields
17 which are otherwise occur in the whole data stream.
18 By default, vcf2td(1) recognize fields which are in the first record in the VCF input,
19 does not read ahead more records before sending the header.
20 This option is repeatable.
22 =item -i, --ignore-non-existing-columns
24 Don't fail and don't warn when ecountering new field names.
26 Tabular data format declares all of the field names in the column headers,
27 so it can not introduce new columns later on in the data stream
28 (unless some records were buffered which are not currently).
29 However in VCF, each record may have fields different from the first record.
30 That's why vcf2td(1) fails itself by default
31 if it encounters a field it can not convert to tabular.
33 =item -w, --warn-non-existing-columns
35 Only warns on new fields, but don't fail.
37 =item -g, --multivalue-glue I<STR>
39 A string to glue repeated fields' values together
40 when the repeated fields are handled by uniting their content into one tabdata column.
41 Default is newline.
43 Note, eventhough newline is the default glue, but
44 if you want to be explicit about it (or want to set an other glue I<STR> expressed often by some backslash sequence),
45 C<vcf2td -g "\n" ...> probably won't quite work as one may expect (depending on one's shell),
46 because the shell passes the "backslash" + "n" 2-chars string,
47 instead of a string consisting just 1 "newline" char.
48 So, in bash, put it as C<vcf2td -g $'\n' ...>.
50 =back
52 =head1 COMMON vCard FIELDS
54 =over 4
56 =item N
58 B<N> is for a contact's name, different parts separated by C<;> semicolon.
59 vcf2td(1) simplifies the B<N> field by removing excess semicolons.
60 If you need one or more name parts precisely,
61 request the B<N.family>, B<N.given>, B<N.middle>, B<N.prefixes> fields
62 by the B<-c> option if you want,
63 but this name partitioning method is not quite internationally useful,
64 use the B<FN> (full name) field for persons' names as much as you can.
66 =back
68 =cut
71 $OptMultiValueGlue = "\n";
72 $OptWarnBadColumnNames = 1;
73 $OptFailBadColumnNames = 1;
74 @OptPredefColumns = ();
75 %OptionDefs = (
76 'g|multivalue-glue=s' => \$OptMultiValueGlue,
77 'i|ignore-non-existing-columns' => sub { $OptFailBadColumnNames = 0; $OptWarnBadColumnNames = 0; },
78 'w|warn-non-existing-columns' => sub { $OptFailBadColumnNames = 0; $OptWarnBadColumnNames = 1; },
79 'c|column=s@' => \@OptPredefColumns,
82 use Data::Dumper;
83 use Encode;
84 use MIME::QuotedPrint;
85 use Switch;
86 no if ($] >= 5.018), 'warnings' => 'experimental::smartmatch';
87 do '/usr/lib/tool/perl5/tabdata/common.pl' or die "$@";
89 @columns = @OptPredefColumns;
90 @warned_fields = ();
91 %record = ();
92 $RN = 0;
95 sub write_record
97 my $record = shift;
99 if($RN == 0)
101 print join($FS, map {escape_tabdata($_)} @columns) . $RS;
104 print join($FS, map {escape_tabdata($record->{$_})} @columns) . $RS;
108 while(<STDIN>)
110 s/\r?\n//;
112 if(my($field, $data) = /^(\S[^:]*):(.*)$/)
114 my($field_name, @params) = split /;/, $field;
115 my %param = map {split /=/, $_, 2} @params;
117 if(not $field_name ~~ @columns)
119 if(not $field_name ~~ [qw/BEGIN VERSION END/])
121 if($RN == 0)
123 push @columns, $field_name
125 else
127 unless($field_name ~~ @warned_fields)
129 warn "$0: column not defined: $field_name\n" if $OptWarnBadColumnNames;
130 push @warned_fields, $field_name;
132 die if $OptFailBadColumnNames;
133 next;
138 switch($param{'ENCODING'})
140 case('QUOTED-PRINTABLE') { $data = decode_qp($data); }
141 case(undef) { 1; }
142 else { die "$0: unknown encoding: $param{'ENCODING'}, line $., record $RN\n"; }
145 if($field_name eq 'N')
147 my ($family_name, $given_name, $middle_name, $prefixes) = split /;/, $data;
148 $record{'N.family'} = $family_name;
149 $record{'N.given'} = $given_name;
150 $record{'N.middle'} = $middle_name;
151 $record{'N.prefixes'} = $prefixes;
153 $data =~ s/;+/;/g;
154 $data =~ s/;$//;
155 $data =~ s/^;//;
158 if(exists $record{$field_name})
160 $record{$field_name} .= $OptMultiValueGlue.$data;
162 else
164 $record{$field_name} = $data;
167 if($field_name eq 'END')
169 write_record(\%record);
170 %record = ();
171 $RN++;
172 next;
177 # write last incomplete record
178 if(%record)
180 write_record(\%record);