Newbie question: an extended comma or tab separated (CSV/TSV) Ragel scanner

Heiko heiko.pael... at gmail.com
Sun Jun 29 13:24:48 UTC 2008


Hi Erich,

many thanks!
I had a look at your code, but realise that you are using a Ragel
state machine, whereas I am trying to use a scanner.
Also, for full CSV files, there is the complication that quoted items
can contain newline characters, which need to be treated separately.
Do you have any idea why my implementation swallows one character
after a newline \n, but not after \r?

Many thanks for your help!

Cheers,
Heiko


On Jun 28, 5:01 pm, Erich Ocean <er... at atlasocean.com> wrote:
> Heiko,
>
> Not sure if this will help, but this is from a PDF 1.4 parsing machine  
> I wrote years ago:
>
> %% PDFParsingMachine
>
> alphtype unsigned char;
>
> # the whitespace, eol, delimiter, regular, and comment machines
>
> # whitespace in a PDF file includes the NULL character,
> # and consecutive whitespace is treated as one
> whitespace = /[\t\f\n\r\0 ]/ ;
>
> eol = /[\r\n]/ | '\r\n' ;
>
> delimiter = [()<>\[\]{}/%] ;
>
> regular = any - ( whitespace | delimiter ) ;
>
> # The priority bump on the terminator of the comments brings us
> # out of the extend* which matches everything.
> comment = '%' . extend* $0 . eol @1 ;
>
> #
> # (Other machines not shown.)
> #
>
> main := (
>         whitespace |
>         comment |
>         boolean |
>         number |
>         hexString |
>         name |
>         literalString |
>         beginArray |
>         endArray |
>         beginDict |
>         endDict |
>         pdfNull |
>         stream |
>         beginIndirectObject |
>         endIndirectObject |
>         indirectObjectReference |
>         beginXref |
>         beginTrailer |
>         beginStartxref |
>         beginFree |
>         beginInUse )**;
>
> %%
>
> Best, Erich
>
> On Jun 28, 2008, at 8:52 AM, Heiko wrote:
>
>
>
> > Dear list members,
>
> > I am trying to implement a CSV scanner based on the fantastic Ragel,
> > with a few small modifications to the standard:
>
> > - it should work for different types of Unix/Mac/Windows line endings
> > ( \n, \r, \r\n)
> > - it should use both commas and tabs as item separators
> > - it should honour quoted values
> > - it should collapse multiple empty lines into a single line
> > separator ...
>
> > I have written the following ragel code (below) to accomplish this,
> > but there is a problem that I cannot locate ...
> > When the line endings are \r (CR), everything seems to work fine,
> > however, if they are \n (LF), the first character of the next field is
> > swallowed by the scanner ....
> > I am pulling my hair out, and suspect it has to do with ambiguities in
> > the definition of the scanner. I am also unsure as to which transition
> > action I should choose (currently '@').
>
> > Does anyone have any ideas?
>
> > Cheers,
> > Heiko
>
> > //
> > //  csv_parse.m
> > //  RagelCsv
> > //
> > //
>
> > #define HPDEBUG 1
> > #import <Foundation/Foundation.h>
>
> > void csv_parse(unsigned char *input, size_t len, NSMutableArray
> > **parsedData, long *maxcols, long *nlines, long *nrecords) {
> >    long tempmaxcols=0, tempnrecords=0;
> >    int cs=0, act, curline = 1; //ragel variables to keep states
> >    unsigned char *tokstart = NULL, *tokend = NULL; //ragel variables
> > for Scanner
> >    unsigned char *p = input, *pe = input + len; //ragel variables to
> > keep track of position in stream
> >    NSMutableArray *row=[[NSMutableArray alloc] init]; //Array to hold
> > elements from each row/record
> >    NSMutableArray *tempParsedData=[[NSMutableArray alloc] init]; //
> > Array of row arrays
> >    NSString *coldata; //string that holds entry in field
> >    NSMutableString *tempInputString=[NSString stringWithUTF8String:(char
> > *)input];
>
> > //Discussion
> > //On UNIX, text file line-endings are terminated with a newline (\n),
> > also referred to as a linefeed (LF).
> > //On Windows, line-endings are terminated with a combination of a
> > carriage return (\r) and a newline (\n), also referred to as CR/LF.
> > //On the Mac Classic, line-endings are terminated with a single
> > carriage return (CR). (Mac OS X uses the UNIX convention.)
>
> > //A line is delimited by any of these characters, the longest possible
> > sequence being preferred to any shorter:
> > //U+000D (\r or CR) //U+2028 (Unicode line separator) //U+000A (\n or
> > LF) //U+2029 (Unicode paragraph separator)
> > // \r\n, in that order (also known as CRLF)
>
> > //append end of line if not present so we can obtain all records.
> > [row autorelease];
>
> > %%{
> >    machine csv_scan;
> >    alphtype unsigned char;
>
> >    newline =('\r\n') | ('\n') | ('\r') %{
> >         curline += 1;
> >    };
> >    multiline =(('\r\n') | ('\n') | ('\r')).(('\r\n') | ('\n') | ('\r'))
> > +  @{
> >         curline += 1;
> >    };
>
> >    ws = ' ';
> >    Separator = [,\t];
> >    UnQuotedValue = [^ \t",\r\n].[^"\t,\r\n]*;
> >    QuotedChar = ( '""' | [^"] | (newline|multiline) );
> >    QuotedValue = '"' . QuotedChar* . '"';
>
> >    main := |*
> >      ws;
> >      multiline     @{
> >            if ([coldata length]==0)
> >                    coldata=(NSString *)[NSNull null];
> >            [row addObject:coldata];
> >            coldata=nil;
> >            tempnrecords++;
> >            if(!row) row=[NSMutableArray arrayWithObject:[NSNull null]];
> >            [tempParsedData addObject:row];
> >            if ([row count] >tempmaxcols) tempmaxcols=[row count];
> >            row=[NSMutableArray array];
> >            if (HPDEBUG) NSLog(@"multiline");
> >      };
> >      newline  @{
> >            if ([coldata length]==0)
> >                    coldata=(NSString *)[NSNull null];
> >            [row addObject:coldata];
> >            coldata=nil;
> >            tempnrecords++;
> >            if(!row) row=[NSMutableArray arrayWithObject:[NSNull null]];
> >            [tempParsedData addObject:row];
> >            if ([row count] >tempmaxcols) tempmaxcols=[row count];
> >            row=[NSMutableArray array];
> >            if (HPDEBUG) NSLog(@"newline");
> >      };
> >      Separator {
> >            if ([coldata length]==0)
> >                    coldata=(NSString *)[NSNull null];
> >            [row addObject:coldata];
> >            tempnrecords++;
> >            coldata=nil;
> > if (HPDEBUG) NSLog(@"separator");
> >      };
>
> >      UnQuotedValue {
> >          unsigned char ch, *endp;
> >          int datalen;
> >          datalen = tokend - tokstart;
> >          endp = tokend - 1;
> >          while(datalen>0) {
> >              ch = *endp--;
> > /*              if (ch==' ' || ch=='\t') {*/
> >              if (ch==' ') {
> >                  datalen--;
> >              } else {
> >                  break;
> >              }
> >          }
>
> >          if (datalen==0) {
> >                            coldata = (NSString *)[NSNull null];
> >          } else {
> >                            coldata=[NSString stringWithString:[tempInputString
> > substringWithRange:NSMakeRange((int)(tokstart-input), datalen)]];
> >          }
> >     if (HPDEBUG)                             NSLog(@"Unquoted value: %@",coldata);
> >      };
> >      QuotedValue {
> >          unsigned char ch, *start_p, *wptr, *rptr;
> >          int rest, datalen;
> >          start_p = wptr = tokstart;
> >          rptr = tokstart + 1;
> >          rest = tokend - tokstart - 2;
> >          datalen = 0;
> >          while(rest>0) {
> >              ch = *rptr++;
> >              if (ch=='"') {
> >                rptr++;
> >                rest--;
> >              }
> >              *wptr++ = ch;
> >              datalen++;
> >              rest--;
> >          }
> >              tempInputString=[NSString stringWithUTF8String:(char *)input]; //
> > reset tempInputString after messing with chars in input
> >              coldata=[NSString stringWithString:[tempInputString
> > substringWithRange:NSMakeRange((int)(start_p-input), datalen)]];
> >    if (HPDEBUG)                                            NSLog(@"  Quoted value: %@",coldata);
> >       };
> >    *|;
> > }%%
> > %% write data nofinal;
>
> > %%write init;
> > %%write exec;
> > %%write eof;
>
> >    if(row) {
> > //   [tempParsedData addObject:row];
> > //   if ([row count] >tempmaxcols) tempmaxcols=[row count];
> > //   [row autorelease];
> >      row=nil;
> >      }
>
> >    *parsedData=tempParsedData;
>
> > if ( cs == csv_scan_error ) {
> >           NSLog(@"CSVscan parse error on line %d.", curline);
> >        }
>
> >    *nrecords=tempnrecords;
> >    *nlines=curline-1;
> >    *maxcols=tempmaxcols;
>



More information about the ragel-users mailing list