[ragel-users] Newbie question: an extended comma or tab separated (CSV/TSV) Ragel scanner

Erich Ocean er... at atlasocean.com
Sat Jun 28 16:01:29 UTC 2008


Heiko,

Not sure if this will help, but this is from a PDF 1.4 parsing machine  
I wrote years ago:

%% PDFParsingMachine

# Excerpt of a Ragel tokenizer for PDF 1.4 files. Only the low-level
# character machines and the top-level `main` machine are shown here; the
# individual token machines referenced by `main` are defined elsewhere.

# Input symbols are unsigned bytes.
alphtype unsigned char;

# the whitespace, eol, delimiter, regular, and comment machines
	
# whitespace in a PDF file includes the NULL character,
# and consecutive whitespace is treated as one
# (this machine itself matches a single character: tab, form feed, LF, CR,
# NUL or space; runs are consumed by the repetition in `main`)
whitespace = /[\t\f\n\r\0 ]/ ;

# End of line: a lone CR, a lone LF, or the two-character CR LF pair.
eol = /[\r\n]/ | '\r\n' ;
	
# The delimiter characters: ( ) < > [ ] { } / %
delimiter = [()<>\[\]{}/%] ;
	
# A regular character is any character that is neither whitespace nor a
# delimiter.
regular = any - ( whitespace | delimiter ) ;

# A comment is '%' followed by anything up to and including an eol.
# The priority bump on the terminator of the comments brings us
# out of the extend* which matches everything.
# `$0` puts priority 0 on every transition of extend*, and `@1` puts
# priority 1 on the transitions that complete eol, so when both are
# possible the eol transition is the one taken.
comment = '%' . extend* $0 . eol @1 ;

#
# (Other machines not shown.)
#

# Top-level machine: any sequence of the token machines below, repeated with
# `**` (Ragel's longest-match Kleene star). Every name other than whitespace
# and comment refers to one of the machines not shown in this excerpt.
main := (
	whitespace |
	comment |
	boolean |
	number |
	hexString |
	name |
	literalString |
	beginArray |
	endArray |
	beginDict |
	endDict |
	pdfNull |
	stream |
	beginIndirectObject |
	endIndirectObject |
	indirectObjectReference |
	beginXref |
	beginTrailer |
	beginStartxref |
	beginFree |
	beginInUse )**;

%%

Best, Erich

On Jun 28, 2008, at 8:52 AM, Heiko wrote:

>
> Dear list members,
>
> I am trying to implement a CSV scanner based on the fantastic Ragel,
> with a few small modifications to the standard:
>
> - it should work for different types of Unix/Mac/Windows line endings
> ( \n, \r, \r\n)
> - it should use both commas and tabs as item separators
> - it should honour quoted values
> - it should collapse multiple empty lines into a single line
> separator ...
>
> I have written the following ragel code (below) to accomplish this,
> but there is a problem that I cannot locate ...
> When the line endings are \r (CR), everything seems to work fine;
> however, if they are \n (LF), the first character of the next field is
> swallowed by the scanner ....
> I am pulling my hair out, and suspect it has to do with ambiguities in
> the definition of the scanner. I am also unsure as to which transition
> action I should choose (currently '@').
>
> Does anyone have any ideas?
>
> Cheers,
> Heiko
>
>
>
>
>
>
>
> //
> //  csv_parse.m
> //  RagelCsv
> //
> //
>
>
>
> #define HPDEBUG 1
> #import <Foundation/Foundation.h>
>
> void csv_parse(unsigned char *input, size_t len, NSMutableArray
> **parsedData, long *maxcols, long *nlines, long *nrecords) {
>    long tempmaxcols=0, tempnrecords=0;
>    int cs=0, act, curline = 1; //ragel variables to keep states
>    unsigned char *tokstart = NULL, *tokend = NULL; //ragel variables
> for Scanner
> 	unsigned char *p = input, *pe = input + len; //ragel variables to
> keep track of position in stream
>    NSMutableArray *row=[[NSMutableArray alloc] init]; //Array to hold
> elements from each row/record
> 	NSMutableArray *tempParsedData=[[NSMutableArray alloc] init]; //
> Array of row arrays
> 	NSString *coldata; //string that holds entry in field
> 	NSMutableString *tempInputString=[NSString stringWithUTF8String:(char
> *)input];
>
> //Discussion
> //On UNIX, text file line-endings are terminated with a newline (n),
> also referred to as a linefeed (LF).
> //On Windows, line-endings are terminated with a combination of a
> carriage return (r) and a newline(n), also referred to as CR/LF.
> //On the Mac Classic, line-endings are terminated with a single
> carriage return (CR). (Mac OS X uses the UNIX convention.)
>
> //A line is delimited by any of these characters, the longest possible
> sequence being preferred to any shorter:
> //U+000D (\r or CR) //U+2028 (Unicode line separator) //U+000A (\n or
> LF) //U+2029 (Unicode paragraph separator)
> // \r\n, in that order (also known as CRLF)
>
>
> //append end of line if not present so we can obtain all records.
> [row autorelease];
>
> %%{
>    machine csv_scan;
> 	alphtype unsigned char;
>
> 	newline =('\r\n') | ('\n') | ('\r') %{
>         curline += 1;
>    };
> 	multiline =(('\r\n') | ('\n') | ('\r')).(('\r\n') | ('\n') | ('\r'))
> +  @{
>         curline += 1;
>    };
>
> 	ws = ' ';
>    Separator = [,\t];
> 	UnQuotedValue = [^ \t",\r\n].[^"\t,\r\n]*;
> 	QuotedChar = ( '""' | [^"] | (newline|multiline) );
>    QuotedValue = '"' . QuotedChar* . '"';
>
>    main := |*
>      ws;
>      multiline     @{
> 		if ([coldata length]==0)
> 			coldata=(NSString *)[NSNull null];
> 		[row addObject:coldata];
> 		coldata=nil;
> 		tempnrecords++;
> 		if(!row) row=[NSMutableArray arrayWithObject:[NSNull null]];
> 		[tempParsedData addObject:row];
> 		if ([row count] >tempmaxcols) tempmaxcols=[row count];
> 		row=[NSMutableArray array];
> 		if (HPDEBUG) NSLog(@"multiline");
> 	  };
>      newline  @{
> 		if ([coldata length]==0)
> 			coldata=(NSString *)[NSNull null];
> 		[row addObject:coldata];
> 		coldata=nil;
> 		tempnrecords++;
> 		if(!row) row=[NSMutableArray arrayWithObject:[NSNull null]];
> 		[tempParsedData addObject:row];
> 		if ([row count] >tempmaxcols) tempmaxcols=[row count];
> 		row=[NSMutableArray array];
> 		if (HPDEBUG) NSLog(@"newline");
> 	  };
>      Separator {
> 		if ([coldata length]==0)
> 			coldata=(NSString *)[NSNull null];
> 		[row addObject:coldata];
> 		tempnrecords++;
> 		coldata=nil;
> if (HPDEBUG) NSLog(@"separator");
> 	  };
>
>      UnQuotedValue {
>          unsigned char ch, *endp;
>          int datalen;
>          datalen = tokend - tokstart;
>          endp = tokend - 1;
>          while(datalen>0) {
>              ch = *endp--;
> /*              if (ch==' ' || ch=='\t') {*/
>              if (ch==' ') {
>                  datalen--;
>              } else {
>                  break;
>              }
>          }
>
>          if (datalen==0) {
> 				coldata = (NSString *)[NSNull null];
>          } else {
> 				coldata=[NSString stringWithString:[tempInputString
> substringWithRange:NSMakeRange((int)(tokstart-input), datalen)]];
>          }
> 	 if (HPDEBUG)		  		  NSLog(@"Unquoted value: %@",coldata);
>      };
>      QuotedValue {
>          unsigned char ch, *start_p, *wptr, *rptr;
>          int rest, datalen;
>          start_p = wptr = tokstart;
>          rptr = tokstart + 1;
>          rest = tokend - tokstart - 2;
>          datalen = 0;
>          while(rest>0) {
>              ch = *rptr++;
>              if (ch=='"') {
>                rptr++;
>                rest--;
>              }
>              *wptr++ = ch;
>              datalen++;
>              rest--;
>          }
> 		  tempInputString=[NSString stringWithUTF8String:(char *)input]; //
> reset tempInputString after messing with chars in input
> 		  coldata=[NSString stringWithString:[tempInputString
> substringWithRange:NSMakeRange((int)(start_p-input), datalen)]];
> 	if (HPDEBUG)						NSLog(@"  Quoted value: %@",coldata);
>       };
>    *|;
> }%%
> %% write data nofinal;
>
>
> %%write init;
> %%write exec;
> %%write eof;
>
>    if(row) {
> //	  [tempParsedData addObject:row];
> //	  if ([row count] >tempmaxcols) tempmaxcols=[row count];
> //	  [row autorelease];
> 	  row=nil;
> 	  }
>
> 	*parsedData=tempParsedData;
>
>
> if ( cs == csv_scan_error ) {
>           NSLog(@"CSVscan parse error on line %d.", curline);
>        }
>
> 	*nrecords=tempnrecords;
> 	*nlines=curline-1;
> 	*maxcols=tempmaxcols;
>
> }
>
>
>
> >



More information about the ragel-users mailing list