blob: 184149b5448dea6b2bd4a6d17824fb2901dc74a8 [file] [log] [blame]
yu.dongc33b3072024-08-21 23:14:49 -07001package Parse::CSV;
2
3=pod
4
5=head1 NAME
6
Parse::CSV - Highly flexible CSV parser for large files
8
9=head1 SYNOPSIS
10
 # Simple headerless comma-separated column parser
12 my $simple = Parse::CSV->new(
13 file => 'file.csv',
14 );
15 while ( my $array_ref = $simple->fetch ) {
16 # Do something...
17 }
18
19... or a more complex example...
20
 # Parse a semicolon-separated values file from a handle as a hash
22 # based on headers from the first line.
23 # Then filter, so we emit objects rather than the plain hash.
24 my $objects = Parse::CSV->new(
25 handle => $io_handle,
26 sep_char => ';',
27 fields => 'auto',
28 filter => sub { My::Object->new( $_ ) },
29 );
30 while ( my $object = $objects->fetch ) {
31 $object->do_something;
32 }
33
34=head1 DESCRIPTION
35
36Surely the CPAN doesn't need yet another CSV parsing module.
37
38L<Text::CSV_XS> is the standard parser for CSV files. It is fast as hell,
39but unfortunately it can be a bit verbose to use.
40
41A number of other modules have attempted to put usability wrappers around
42this venerable module, but they have all focussed on parsing the entire
43file into memory at once.
44
45This method is fine unless your CSV files start to get large. Once that
46happens, the only existing option is to fall back on the relatively slow
47and heavyweight L<XML::SAXDriver::CSV> module.
48
49L<Parse::CSV> fills this functionality gap. It provides a flexible
50and light-weight streaming parser for large, extremely large, or
51arbitrarily large CSV files.
52
53=head2 Main Features
54
B<Stream-Based Parser> - All parsing is done a line at a time.
56
57B<Array Mode> - Parsing can be done in simple array mode, returning
58a reference to an array if the columns are not named.
59
B<Hash Mode> - Parsing can be done in hash mode, putting the data into
a hash and returning a reference to it.
62
63B<Filter Capability> - All items returned can be passed through a
64custom filter. This filter can either modify the data on the fly,
65or drop records you don't need.
66
67=head2 Writing Filters
68
69A L<Parse::CSV> filter is a subroutine reference that is passed the raw
70record as C<$_>, and should C<return> the alternative or modified record
71to return to the user.
72
73The null filter (does not modify or drop any records) looks like the
74following.
75
76 sub { $_ };
77
A filter which reverses the order of the columns (assuming they are
79passed as an array) might look like the following.
80
81 sub { return [ reverse @$_ ] };
82
83To drop the record, you should return C<undef> from the filter. The
84parser will then keep pulling and parsing new records until one
85passes the filter.
86
87 # Only keep records where foo is true
88 sub { $_->{foo} ? $_ : undef }
89
To signal an error, throw an exception.
91
92 sub {
93 $_->{foo} =~ /bar/ or die "Assumption failed";
94 return $_;
95 }
96
97=head1 METHODS
98
99=cut
100
101use 5.005;
102use strict;
103use Carp ();
104use IO::File ();
105use Text::CSV_XS ();
106use Params::Util qw{ _STRING _ARRAY _HASH0 _CODELIKE _HANDLE };
107
108use vars qw{$VERSION};
109BEGIN {
110 $VERSION = '1.00';
111}
112
113
114
115
116
117#####################################################################
118# Constructor
119
120=pod
121
122=head2 new
123
The C<new> constructor creates and initialises a new CSV parser.
125
126It takes a number of params.
127
128To specify the CSV data source, you should provide either the C<file>
129param, which should be the name of the file to read, or the C<handle>
130param, which should be a file handle to read instead.
131
The actual parsing is done using L<Text::CSV_XS>. Any of its
133constructor/parsing params can also be provided to this C<new> method,
134and they will be passed on.
135
136Alternatively, they can be passed as a single C<HASH> reference as the
137C<csv_attr> param. For example:
138
139 $parser = Parse::CSV->new(
140 file => 'file.csv',
141 csv_attr => {
142 sep_char => ';',
143 quote_char => "'",
144 },
145 );
146
147An optional C<fields> param can be provided, which should be an array
148reference containing the names of the columns in the CSV file.
149
150 $parser = Parse::CSV->new(
151 file => 'file.csv',
152 fields => [ 'col1', 'col2', 'col3' ],
153 );
154
155If the C<fields> param is provided, the parser will map the columns to a
156hash where the keys are the field names provided, and the values are the
157values found in the CSV file.
158
159If the C<fields> param is B<not> provided, the parser will return simple
160array references of the columns.
161
162If the C<fields> param is the string 'auto', the fields will be
163automatically determined by reading the first line of the CSV file and
164using those values as the field names.
165
166The optional C<filter> param will be used to filter the records if
167provided. It should be a C<CODE> reference or any otherwise callable
168scalar, and each value parsed (either array reference or hash reference)
169will be passed to the filter to be changed or converted into an object,
170or whatever you wish.
171
172Returns a new L<Parse::CSV> object, or throws an exception (dies) on error.
173
174=cut
175
# Constructor. Builds a parser from a flat list of named params.
#
# Params read here:
#   file     - name of a readable plain file to open for reading
#   handle   - an already-open IO handle (replaced when file is given)
#   csv_attr - HASH ref of options handed to Text::CSV_XS->new
#   fields   - ARRAY ref of column names, or the string 'auto' to take
#              the names from the first line of the input
#   filter   - callable applied to every record by fetch
# When csv_attr is not a HASH ref, the options quote_char, eol,
# escape_char, sep_char, binary and always_quote are moved out of the
# top-level params and used as the Text::CSV_XS options instead.
#
# Returns the new object; croaks on any invalid or missing param.
sub new {
    my $class = shift;
    my $self  = bless {
        @_,
        # Listed after @_ so caller-supplied values cannot override them
        row    => 0,
        errstr => '',
    }, $class;

    # Do we have a file name
    if ( exists $self->{file} ) {
        unless ( _STRING($self->{file}) ) {
            Carp::croak("Parse::CSV file param is not a string");
        }
        unless ( -f $self->{file} and -r _ ) {
            Carp::croak("Parse::CSV file '$self->{file}' does not exist");
        }

        # Always pass an explicit read mode. Given only a file name,
        # IO::File falls back to two-argument open(), which trims
        # whitespace and interprets mode characters in the name, so the
        # file opened could differ from the one tested just above.
        $self->{handle} = IO::File->new();
        unless ( $self->{handle}->open($self->{file}, '<') ) {
            Carp::croak("Parse::CSV file '$self->{file}' failed to load: $!");
        }
    }

    # Do we have a file handle
    if ( exists $self->{handle} ) {
        unless ( _HANDLE($self->{handle}) ) {
            Carp::croak("Parse::CSV handle param is not an IO handle");
        }
    } else {
        Carp::croak("Parse::CSV not provided a file or handle param");
    }

    # Separate the Text::CSV attributes
    unless ( _HASH0($self->{csv_attr}) ) {
        $self->{csv_attr} = {};
        foreach ( qw{quote_char eol escape_char sep_char binary always_quote} ) {
            next unless exists $self->{$_};
            $self->{csv_attr}->{$_} = delete $self->{$_};
        }
    }

    # Create the parser
    $self->{csv_xs} = Text::CSV_XS->new( $self->{csv_attr} );
    unless ( $self->{csv_xs} ) {
        Carp::croak("Failed to create Text::CSV_XS parser");
    }

    # Handle automatic fields
    if ( _STRING($self->{fields}) and lc($self->{fields}) eq 'auto' ) {
        # Grab the first line (this advances the row counter to 1)
        my $line = $self->_getline;
        unless ( defined $line ) {
            Carp::croak("Failed to get header line from CSV");
        }

        # Parse the line into columns
        unless ( $self->{csv_xs}->parse($line) ) {
            Carp::croak(
                "Failed to parse header line from CSV: "
                . $self->{csv_xs}->error_input
            );
        }

        # Use the parsed header values as the field names
        my @cols = $self->{csv_xs}->fields;
        $self->{fields} = \@cols;
    }

    # Check fields
    if ( exists $self->{fields} and ! _ARRAY($self->{fields}) ) {
        Carp::croak("Parse::CSV fields param is not an array reference of strings");
    }

    # Check filter
    if ( exists $self->{filter} and ! _CODELIKE($self->{filter}) ) {
        Carp::croak("Parse::CSV filter param is not callable");
    }

    $self;
}
254
255
256
257
258
259#####################################################################
260# Main Methods
261
262=pod
263
264=head2 fetch
265
266Once a L<Parse::CSV> object has been created, the C<fetch> method is
267used to parse and return the next value from the CSV file.
268
269Returns an C<ARRAY>, C<HASH> or the output of the filter, based on the
270configuration of the object, or C<undef> in a variety of situations.
271
272Returning C<undef> means either some part of the parsing and filtering
273process has resulted in an error, B<or> that the end of file has been
274reached.
275
On receiving C<undef>, you should check the C<errstr> method. If it is a null
277string you have reached the end of file. Otherwise the error message will
278be returned. Thus, the basic usage of L<Parse::CSV> will look like the
279following.
280
281 my $parser = Parse::CSV->new(
282 file => 'file.csv',
283 );
284 while ( my $value = $parser->fetch ) {
285 # Do something...
286 }
287 if ( $parser->errstr ) {
288 # Handle errors...
289 }
290
291=cut
292
# Parse and return the next record that passes the filter.
#
# Returns an ARRAY ref of columns when no fields are configured, a HASH
# ref keyed by field name when they are, or the filter's return value
# when a filter is set. Returns undef when a line fails to parse, when
# the filter throws (errstr is set in both cases), or when no further
# line can be read.
sub fetch {
    my $self = shift;

    # A filter may reject records, so keep reading until one is
    # accepted or there is nothing left to read.
    while (1) {
        my $line = $self->_getline;
        last unless defined $line;

        # Split the raw line into column values
        my $csv = $self->{csv_xs};
        unless ( $csv->parse($line) ) {
            $self->{errstr} = "Failed to parse row $self->{row}";
            return undef;
        }
        my @values = $csv->fields;

        # Hash mode when field names are configured, array mode otherwise
        my $names = $self->{fields};
        my $record;
        if ( $names ) {
            my %named;
            @named{ @$names } = @values[ 0 .. $#$names ];
            $record = \%named;
        } else {
            $record = \@values;
        }

        # Without a filter the record goes straight back to the caller
        return $record unless $self->{filter};

        # The filter receives the record in $_ and is called without
        # arguments; any exception it throws is trapped here.
        local $_ = $record;
        my $filtered = eval { $self->{filter}->() };
        if ( $@ ) {
            $self->{errstr} = "Filter error: $@";
            $self->{errstr} =~ s/^(.+)at line.+$/$1/;
            return undef;
        }

        # A defined result is a keeper; undef means "drop this record"
        return $filtered if defined $filtered;
    }

    return undef;
}
340
# Read the next raw line from the handle.
#
# Clears errstr first. On success the row counter is incremented and
# the line is returned as read. When nothing can be read, returns undef
# with errstr left empty at end of file, or set to $! otherwise.
sub _getline {
    my ($self) = @_;

    # Every read starts from a clean error state
    $self->{errstr} = '';

    my $fh   = $self->{handle};
    my $text = readline($fh);

    if ( defined $text ) {
        # Count the line that was just read
        $self->{row}++;
        return $text;
    }

    # Nothing was read: tell end of file apart from a read failure
    $self->{errstr} = $fh->eof ? '' : $!;
    return undef;
}
357
358=pod
359
360=head2 row
361
362The C<row> method returns the current row of the CSV file.
363
364This is a one-based count, so when you first create the parser,
365the value of C<row> will be zero (unless you are using
C<< fields => 'auto' >> in which case it will be 1).
367
368=cut
369
# Accessor: the number of lines read from the input so far.
sub row {
    my $self = shift;
    return $self->{row};
}
373
374=pod
375
376=head2 combine
377
378 $status = $csv->combine(@columns);
379
380The C<combine> method is provided as a convenience, and is passed through
381to the underlying L<Text::CSV_XS> object.
382
383=cut
384
# Convenience pass-through to the combine method of the wrapped
# Text::CSV_XS object held in $self->{csv_xs}.
#
# All arguments are forwarded unchanged and the wrapped method's return
# value is handed back. The call must go to the wrapped object: invoking
# ->combine on $self would re-enter this method and recurse without end.
sub combine {
    my $self = shift;
    return $self->{csv_xs}->combine(@_);
}
388
389=pod
390
391=head2 string
392
 $line = $csv->string;
394
395The C<string> method is provided as a convenience, and is passed through
396to the underlying L<Text::CSV_XS> object.
397
398=cut
399
# Convenience pass-through to the string method of the wrapped
# Text::CSV_XS object held in $self->{csv_xs}.
#
# Any arguments are forwarded unchanged and the wrapped method's return
# value is handed back. The call must go to the wrapped object: invoking
# ->string on $self would re-enter this method and recurse without end.
sub string {
    my $self = shift;
    return $self->{csv_xs}->string(@_);
}
403
404=pod
405
406=head2 print
407
 $status = $csv->print($io, $columns);
409
410The C<print> method is provided as a convenience, and is passed through
411to the underlying L<Text::CSV_XS> object.
412
413=cut
414
# Convenience pass-through to the print method of the wrapped
# Text::CSV_XS object held in $self->{csv_xs}.
#
# All arguments are forwarded unchanged and the wrapped method's return
# value is handed back. The call must go to the wrapped object: invoking
# ->print on $self would re-enter this method and recurse without end.
sub print {
    my $self = shift;
    return $self->{csv_xs}->print(@_);
}
418
419=pod
420
421=head2 errstr
422
On error, the C<errstr> method returns the error that occurred.
424
425If the last action was NOT an error, returns the null string C<''>.
426
427=cut
428
# Accessor: the message recorded by the most recent operation, or the
# empty string when that operation did not record an error.
sub errstr {
    my $self = shift;
    return $self->{errstr};
}
432
4331;
434
435=pod
436
437=head1 SUPPORT
438
Bugs should always be reported via the CPAN bug tracker at
440
441L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Parse-CSV>
442
443For other issues, or commercial enhancement or support, contact the author.
444
445=head1 AUTHORS
446
447Adam Kennedy E<lt>adamk@cpan.orgE<gt>
448
449=head1 SEE ALSO
450
451L<Text::CSV_XS>, L<http://ali.as/>
452
453=head1 COPYRIGHT
454
455Copyright 2006 Adam Kennedy.
456
457This program is free software; you can redistribute
458it and/or modify it under the same terms as Perl itself.
459
460The full text of the license can be found in the
461LICENSE file included with this module.
462
463=cut