yu.dong | c33b307 | 2024-08-21 23:14:49 -0700 | [diff] [blame^] | 1 | package Parse::CSV;
|
| 2 |
|
| 3 | =pod
|
| 4 |
|
| 5 | =head1 NAME
|
| 6 |
|
| 7 | Parse::CSV - Highly flexible CSV parser for large files
|
| 8 |
|
| 9 | =head1 SYNOPSIS
|
| 10 |
|
| 11 | # Simple headerless comma-separated column parser
|
| 12 | my $simple = Parse::CSV->new(
|
| 13 | file => 'file.csv',
|
| 14 | );
|
| 15 | while ( my $array_ref = $simple->fetch ) {
|
| 16 | # Do something...
|
| 17 | }
|
| 18 |
|
| 19 | ... or a more complex example...
|
| 20 |
|
| 21 | # Parse a semicolon-separated values file from a handle as a hash
|
| 22 | # based on headers from the first line.
|
| 23 | # Then filter, so we emit objects rather than the plain hash.
|
| 24 | my $objects = Parse::CSV->new(
|
| 25 | handle => $io_handle,
|
| 26 | sep_char => ';',
|
| 27 | fields => 'auto',
|
| 28 | filter => sub { My::Object->new( $_ ) },
|
| 29 | );
|
| 30 | while ( my $object = $objects->fetch ) {
|
| 31 | $object->do_something;
|
| 32 | }
|
| 33 |
|
| 34 | =head1 DESCRIPTION
|
| 35 |
|
| 36 | Surely the CPAN doesn't need yet another CSV parsing module.
|
| 37 |
|
| 38 | L<Text::CSV_XS> is the standard parser for CSV files. It is fast as hell,
|
| 39 | but unfortunately it can be a bit verbose to use.
|
| 40 |
|
| 41 | A number of other modules have attempted to put usability wrappers around
|
| 42 | this venerable module, but they have all focussed on parsing the entire
|
| 43 | file into memory at once.
|
| 44 |
|
| 45 | This method is fine unless your CSV files start to get large. Once that
|
| 46 | happens, the only existing option is to fall back on the relatively slow
|
| 47 | and heavyweight L<XML::SAXDriver::CSV> module.
|
| 48 |
|
| 49 | L<Parse::CSV> fills this functionality gap. It provides a flexible
|
| 50 | and light-weight streaming parser for large, extremely large, or
|
| 51 | arbitrarily large CSV files.
|
| 52 |
|
| 53 | =head2 Main Features
|
| 54 |
|
| 55 | B<Stream-Based Parser> - All parsing is done a line at a time.
|
| 56 |
|
| 57 | B<Array Mode> - Parsing can be done in simple array mode, returning
|
| 58 | a reference to an array if the columns are not named.
|
| 59 |
|
| 60 | B<Hash Mode> - Parsing can be done in hash mode, putting the data into
|
| 61 | a hash and returning a reference to it.
|
| 62 |
|
| 63 | B<Filter Capability> - All items returned can be passed through a
|
| 64 | custom filter. This filter can either modify the data on the fly,
|
| 65 | or drop records you don't need.
|
| 66 |
|
| 67 | =head2 Writing Filters
|
| 68 |
|
| 69 | A L<Parse::CSV> filter is a subroutine reference that is passed the raw
|
| 70 | record as C<$_>, and should C<return> the alternative or modified record
|
| 71 | to return to the user.
|
| 72 |
|
| 73 | The null filter (does not modify or drop any records) looks like the
|
| 74 | following.
|
| 75 |
|
| 76 | sub { $_ };
|
| 77 |
|
| 78 | A filter which reversed the order of the columns (assuming they are
|
| 79 | passed as an array) might look like the following.
|
| 80 |
|
| 81 | sub { return [ reverse @$_ ] };
|
| 82 |
|
| 83 | To drop the record, you should return C<undef> from the filter. The
|
| 84 | parser will then keep pulling and parsing new records until one
|
| 85 | passes the filter.
|
| 86 |
|
| 87 | # Only keep records where foo is true
|
| 88 | sub { $_->{foo} ? $_ : undef }
|
| 89 |
|
| 90 | To signal an error, throw an exception
|
| 91 |
|
| 92 | sub {
|
| 93 | $_->{foo} =~ /bar/ or die "Assumption failed";
|
| 94 | return $_;
|
| 95 | }
|
| 96 |
|
| 97 | =head1 METHODS
|
| 98 |
|
| 99 | =cut
|
| 100 |
|
| 101 | use 5.005;
|
| 102 | use strict;
|
| 103 | use Carp ();
|
| 104 | use IO::File ();
|
| 105 | use Text::CSV_XS ();
|
| 106 | use Params::Util qw{ _STRING _ARRAY _HASH0 _CODELIKE _HANDLE };
|
| 107 |
|
| 108 | use vars qw{$VERSION};
|
| 109 | BEGIN {
|
| 110 | $VERSION = '1.00';
|
| 111 | }
|
| 112 |
|
| 113 |
|
| 114 |
|
| 115 |
|
| 116 |
|
| 117 | #####################################################################
|
| 118 | # Constructor
|
| 119 |
|
| 120 | =pod
|
| 121 |
|
| 122 | =head2 new
|
| 123 |
|
| 124 | The C<new> constructor creates and initialises a new CSV parser.
|
| 125 |
|
| 126 | It takes a number of params.
|
| 127 |
|
| 128 | To specify the CSV data source, you should provide either the C<file>
|
| 129 | param, which should be the name of the file to read, or the C<handle>
|
| 130 | param, which should be a file handle to read instead.
|
| 131 |
|
| 132 | The actual parsing is done using L<Text::CSV_XS>. Any of its
|
| 133 | constructor/parsing params can also be provided to this C<new> method,
|
| 134 | and they will be passed on.
|
| 135 |
|
| 136 | Alternatively, they can be passed as a single C<HASH> reference as the
|
| 137 | C<csv_attr> param. For example:
|
| 138 |
|
| 139 | $parser = Parse::CSV->new(
|
| 140 | file => 'file.csv',
|
| 141 | csv_attr => {
|
| 142 | sep_char => ';',
|
| 143 | quote_char => "'",
|
| 144 | },
|
| 145 | );
|
| 146 |
|
| 147 | An optional C<fields> param can be provided, which should be an array
|
| 148 | reference containing the names of the columns in the CSV file.
|
| 149 |
|
| 150 | $parser = Parse::CSV->new(
|
| 151 | file => 'file.csv',
|
| 152 | fields => [ 'col1', 'col2', 'col3' ],
|
| 153 | );
|
| 154 |
|
| 155 | If the C<fields> param is provided, the parser will map the columns to a
|
| 156 | hash where the keys are the field names provided, and the values are the
|
| 157 | values found in the CSV file.
|
| 158 |
|
| 159 | If the C<fields> param is B<not> provided, the parser will return simple
|
| 160 | array references of the columns.
|
| 161 |
|
| 162 | If the C<fields> param is the string 'auto', the fields will be
|
| 163 | automatically determined by reading the first line of the CSV file and
|
| 164 | using those values as the field names.
|
| 165 |
|
| 166 | The optional C<filter> param will be used to filter the records if
|
| 167 | provided. It should be a C<CODE> reference or any otherwise callable
|
| 168 | scalar, and each value parsed (either array reference or hash reference)
|
| 169 | will be passed to the filter to be changed or converted into an object,
|
| 170 | or whatever you wish.
|
| 171 |
|
| 172 | Returns a new L<Parse::CSV> object, or throws an exception (dies) on error.
|
| 173 |
|
| 174 | =cut
|
| 175 |
|
# Construct a parser from a flat list of key/value params.
#
# Params examined here: file, handle, csv_attr, fields and filter, plus the
# Text::CSV_XS attributes quote_char, eol, escape_char, sep_char, binary and
# always_quote when csv_attr is not supplied as a hash reference.
#
# Returns the new object; croaks if any param fails validation, if the file
# cannot be opened, or if the header line cannot be read/parsed when
# fields is 'auto'.
sub new {
    my $class = shift;

    # Caller params go first so row and errstr always start from a clean state
    my $self = bless {
        @_,
        row    => 0,
        errstr => '',
    }, $class;

    # Do we have a file name
    if ( exists $self->{file} ) {
        unless ( _STRING($self->{file}) ) {
            Carp::croak("Parse::CSV file param is not a string");
        }
        unless ( -f $self->{file} and -r _ ) {
            Carp::croak("Parse::CSV file '$self->{file}' does not exist");
        }

        # Pass an explicit read mode. Called with only a file name,
        # IO::File falls back to Perl's two-argument open, which interprets
        # mode characters and surrounding whitespace found in the name.
        $self->{handle} = IO::File->new();
        unless ( $self->{handle}->open($self->{file}, '<') ) {
            Carp::croak("Parse::CSV file '$self->{file}' failed to load: $!");
        }
    }

    # Do we have a file handle (either supplied, or opened just above)
    if ( exists $self->{handle} ) {
        unless ( _HANDLE($self->{handle}) ) {
            Carp::croak("Parse::CSV handle param is not an IO handle");
        }
    } else {
        Carp::croak("Parse::CSV not provided a file or handle param");
    }

    # Separate the Text::CSV attributes.
    # An explicit csv_attr hash wins; otherwise collect the known
    # attributes from the top-level params and remove them from the object.
    unless ( _HASH0($self->{csv_attr}) ) {
        $self->{csv_attr} = {};
        foreach my $attr ( qw{quote_char eol escape_char sep_char binary always_quote} ) {
            next unless exists $self->{$attr};
            $self->{csv_attr}->{$attr} = delete $self->{$attr};
        }
    }

    # Create the parser
    $self->{csv_xs} = Text::CSV_XS->new( $self->{csv_attr} );
    unless ( $self->{csv_xs} ) {
        Carp::croak("Failed to create Text::CSV_XS parser");
    }

    # Handle automatic fields
    if ( _STRING($self->{fields}) and lc($self->{fields}) eq 'auto' ) {
        # Grab the first line (this advances the row counter)
        my $line = $self->_getline;
        unless ( defined $line ) {
            Carp::croak("Failed to get header line from CSV");
        }

        # Parse the line into columns
        unless ( $self->{csv_xs}->parse($line) ) {
            Carp::croak(
                "Failed to parse header line from CSV: "
                . $self->{csv_xs}->error_input
            );
        }

        # The header columns become the field names
        my @cols = $self->{csv_xs}->fields;
        $self->{fields} = \@cols;
    }

    # Check fields
    if ( exists $self->{fields} and ! _ARRAY($self->{fields}) ) {
        Carp::croak("Parse::CSV fields param is not an array reference of strings");
    }

    # Check filter
    if ( exists $self->{filter} and ! _CODELIKE($self->{filter}) ) {
        Carp::croak("Parse::CSV filter param is not callable");
    }

    return $self;
}
|
| 254 |
|
| 255 |
|
| 256 |
|
| 257 |
|
| 258 |
|
| 259 | #####################################################################
|
| 260 | # Main Methods
|
| 261 |
|
| 262 | =pod
|
| 263 |
|
| 264 | =head2 fetch
|
| 265 |
|
| 266 | Once a L<Parse::CSV> object has been created, the C<fetch> method is
|
| 267 | used to parse and return the next value from the CSV file.
|
| 268 |
|
| 269 | Returns an C<ARRAY>, C<HASH> or the output of the filter, based on the
|
| 270 | configuration of the object, or C<undef> in a variety of situations.
|
| 271 |
|
| 272 | Returning C<undef> means either some part of the parsing and filtering
|
| 273 | process has resulted in an error, B<or> that the end of file has been
|
| 274 | reached.
|
| 275 |
|
| 276 | On receiving C<undef>, you should check the C<errstr> method. If it is a null
|
| 277 | string you have reached the end of file. Otherwise the error message will
|
| 278 | be returned. Thus, the basic usage of L<Parse::CSV> will look like the
|
| 279 | following.
|
| 280 |
|
| 281 | my $parser = Parse::CSV->new(
|
| 282 | file => 'file.csv',
|
| 283 | );
|
| 284 | while ( my $value = $parser->fetch ) {
|
| 285 | # Do something...
|
| 286 | }
|
| 287 | if ( $parser->errstr ) {
|
| 288 | # Handle errors...
|
| 289 | }
|
| 290 |
|
| 291 | =cut
|
| 292 |
|
# Read, parse and (optionally) filter lines until one yields a record.
#
# Returns an array reference (no fields configured), a hash reference keyed
# by the configured field names, or whatever the filter returned for the
# record. Returns undef when _getline yields no more lines, when a line
# fails to parse, or when the filter throws; in the last two cases errstr
# is set to a non-empty message.
sub fetch {
    my $self = shift;

    # The filter can skip rows,
    # iterate till we get something.
    while ( defined(my $line = $self->_getline) ) {
        # Parse the line into columns
        unless ( $self->{csv_xs}->parse($line) ) {
            $self->{errstr} = "Failed to parse row $self->{row}";
            return undef;
        }

        # Turn the columns into a hash if field names are configured
        my $rv   = undef;
        my $f    = $self->{fields};
        my @cols = $self->{csv_xs}->fields;
        if ( $f ) {
            $rv = {};
            foreach my $i ( 0 .. $#$f ) {
                $rv->{ $f->[$i] } = $cols[$i];
            }
        } else {
            $rv = \@cols;
        }

        # Just return for simple uses
        return $rv unless $self->{filter};

        # Filter if needed. The record is handed to the filter as $_.
        # Success is detected from the eval's own return value rather than
        # from the truth of $@, so an exception that is false in boolean
        # context is still reported instead of being mistaken for a
        # dropped record.
        local $_ = $rv;
        my $ok = eval { $rv = $self->{filter}->(); 1 };
        unless ( $ok ) {
            # Handle filter errors
            $self->{errstr} = "Filter error: $@";
            # NOTE(review): this pattern requires the literal text "at line",
            # so it will not normally strip Perl's " at FILE line N." suffix.
            # Left unchanged so existing errstr values stay the same.
            $self->{errstr} =~ s/^(.+)at line.+$/$1/;
            return undef;
        }

        # Filter returns undef to drop a record
        next unless defined $rv;

        # We have a good record, return it
        return $rv;
    }

    return undef;
}
|
| 340 |
|
# Pull the next raw line from the stored handle.
#
# Clears errstr first. On success, bumps the row counter and returns the
# line. When nothing could be read, returns undef and leaves errstr empty
# if the handle reports end-of-file, or set to $! otherwise.
sub _getline {
    my ($parser) = @_;
    my $fh = $parser->{handle};

    # Every read attempt starts with a clean error state
    $parser->{errstr} = '';

    my $next = readline($fh);
    if ( defined $next ) {
        # Count only the lines that were actually read
        ++$parser->{row};
        return $next;
    }

    # Nothing read: distinguish a normal end of file from a read error
    $parser->{errstr} = $fh->eof ? '' : $!;
    return undef;
}
|
| 357 |
|
| 358 | =pod
|
| 359 |
|
| 360 | =head2 row
|
| 361 |
|
| 362 | The C<row> method returns the current row of the CSV file.
|
| 363 |
|
| 364 | This is a one-based count, so when you first create the parser,
|
| 365 | the value of C<row> will be zero (unless you are using
|
| 366 | C<< fields => 'auto' >> in which case it will be 1).
|
| 367 |
|
| 368 | =cut
|
| 369 |
|
# Accessor: the number of lines read from the input so far.
sub row {
    my $self = shift;
    return $self->{row};
}
|
| 373 |
|
| 374 | =pod
|
| 375 |
|
| 376 | =head2 combine
|
| 377 |
|
| 378 | $status = $csv->combine(@columns);
|
| 379 |
|
| 380 | The C<combine> method is provided as a convenience, and is passed through
|
| 381 | to the underlying L<Text::CSV_XS> object.
|
| 382 |
|
| 383 | =cut
|
| 384 |
|
# Convenience pass-through: forwards the columns to the combine method of
# the underlying parser object held in csv_xs and returns its result.
# Delegating to $self->{csv_xs} (rather than to $self) is what stops this
# method from calling itself recursively.
sub combine {
    my $self = shift;
    return $self->{csv_xs}->combine(@_);
}
|
| 388 |
|
| 389 | =pod
|
| 390 |
|
| 391 | =head2 string
|
| 392 |
|
| 393 | $line = $csv->string;
|
| 394 |
|
| 395 | The C<string> method is provided as a convenience, and is passed through
|
| 396 | to the underlying L<Text::CSV_XS> object.
|
| 397 |
|
| 398 | =cut
|
| 399 |
|
# Convenience pass-through: returns the result of the string method of the
# underlying parser object held in csv_xs.
# Delegating to $self->{csv_xs} (rather than to $self) is what stops this
# method from calling itself recursively.
sub string {
    my $self = shift;
    return $self->{csv_xs}->string(@_);
}
|
| 403 |
|
| 404 | =pod
|
| 405 |
|
| 406 | =head2 print
|
| 407 |
|
| 408 | $status = $csv->print($io, $columns);
|
| 409 |
|
| 410 | The C<print> method is provided as a convenience, and is passed through
|
| 411 | to the underlying L<Text::CSV_XS> object.
|
| 412 |
|
| 413 | =cut
|
| 414 |
|
# Convenience pass-through: forwards its arguments to the print method of
# the underlying parser object held in csv_xs and returns its result.
# Delegating to $self->{csv_xs} (rather than to $self) is what stops this
# method from calling itself recursively.
sub print {
    my $self = shift;
    return $self->{csv_xs}->print(@_);
}
|
| 418 |
|
| 419 | =pod
|
| 420 |
|
| 421 | =head2 errstr
|
| 422 |
|
| 423 | On error, the C<errstr> method returns the error that occurred.
|
| 424 |
|
| 425 | If the last action was NOT an error, returns the null string C<''>.
|
| 426 |
|
| 427 | =cut
|
| 428 |
|
# Accessor: the message recorded by the most recent operation, or the
# empty string when none was recorded.
sub errstr {
    my $self = shift;
    return $self->{errstr};
}
|
| 432 |
|
| 433 | 1;
|
| 434 |
|
| 435 | =pod
|
| 436 |
|
| 437 | =head1 SUPPORT
|
| 438 |
|
| 439 | Bugs should always be reported via the CPAN bug tracker at
|
| 440 |
|
| 441 | L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Parse-CSV>
|
| 442 |
|
| 443 | For other issues, or commercial enhancement or support, contact the author.
|
| 444 |
|
| 445 | =head1 AUTHORS
|
| 446 |
|
| 447 | Adam Kennedy E<lt>adamk@cpan.orgE<gt>
|
| 448 |
|
| 449 | =head1 SEE ALSO
|
| 450 |
|
| 451 | L<Text::CSV_XS>, L<http://ali.as/>
|
| 452 |
|
| 453 | =head1 COPYRIGHT
|
| 454 |
|
| 455 | Copyright 2006 Adam Kennedy.
|
| 456 |
|
| 457 | This program is free software; you can redistribute
|
| 458 | it and/or modify it under the same terms as Perl itself.
|
| 459 |
|
| 460 | The full text of the license can be found in the
|
| 461 | LICENSE file included with this module.
|
| 462 |
|
| 463 | =cut
|