[ragel-users] parser getting gigantic ?

M P buserror at gmail.com
Thu Sep 2 14:41:36 UTC 2010


I decided to play with Ragel by making a parser for an extended
version of JSON.

After falling into every single pitfall possible, I managed to get the
whole thing working pretty well, but I just had a look at the
generated code and it reached 600KB total, using -G1 (!)

So, what am I doing wrong? I'm certain there's probably a lot of
stuff wrong with the way I made the parser, but really, I'm not sure
what, so having experts eyeball this would be very nice...

There are 2 different json parsers in the file; one is for the string
constants, one (the main one) is for the language proper...


/*
 * IF YOU ARE LOOKING AT A .c FILE, YOU ARE LOOKING AT THE WRONG ONE
 *
 * This file is autogenerated from a .rl source file for the 'ragel'
 * parser generator.
 */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#include "json.h"

/*
 * Ragel directive: emit the generated data for the 'json_str' machine
 * (the string-escape decoder used by json_parse_string) at this point.
 */
%%{
	machine json_str;
	write data;
}%%

/*
 * Append the UTF-8 encoding of the code point 'inUnicode' at 'dst'.
 *
 * dst       : output cursor; must have room for up to 4 bytes.
 * inUnicode : code point to encode. Values above U+10FFFF are not valid
 *             Unicode and are written as U+FFFD (replacement character).
 *
 * Returns the cursor advanced past the bytes written. No NUL terminator
 * is appended.
 *
 * UTF-16 surrogate values (U+D800..U+DFFF) are encoded as-is as three
 * bytes; joining surrogate pairs is left to the caller.
 */
static char * json_append_utf8_glyph(
	char * dst,
	unsigned long inUnicode )
{
	unsigned char *cur = (unsigned char *)dst;

	if (inUnicode > 0x10fffful)
		inUnicode = 0xfffdul;

	/*
	 * The payload capacity of a UTF-8 sequence is 7, 11, 16 and 21 bits
	 * for 1, 2, 3 and 4 bytes: the thresholds are NOT 6 bits apart, so
	 * each range is tested explicitly.
	 */
	if (inUnicode < 0x80ul) {
		*cur++ = (unsigned char)inUnicode;
	} else if (inUnicode < 0x800ul) {
		*cur++ = (unsigned char)(0xc0 | (inUnicode >> 6));
		*cur++ = (unsigned char)(0x80 | (inUnicode & 0x3f));
	} else if (inUnicode < 0x10000ul) {
		*cur++ = (unsigned char)(0xe0 | (inUnicode >> 12));
		*cur++ = (unsigned char)(0x80 | ((inUnicode >> 6) & 0x3f));
		*cur++ = (unsigned char)(0x80 | (inUnicode & 0x3f));
	} else {
		*cur++ = (unsigned char)(0xf0 | (inUnicode >> 18));
		*cur++ = (unsigned char)(0x80 | ((inUnicode >> 12) & 0x3f));
		*cur++ = (unsigned char)(0x80 | ((inUnicode >> 6) & 0x3f));
		*cur++ = (unsigned char)(0x80 | (inUnicode & 0x3f));
	}
	return (char *)cur;
}

/*
 * Decode the backslash escapes of a JSON string body into UTF-8.
 *
 * str : first character of the string body (after the opening quote).
 * end : one past the last character to decode, or NULL to decode up to
 *       the NUL terminator of 'str'.
 * out : destination buffer, or NULL to decode in place over 'str'.
 *       The decoded text is never longer than the input, so a buffer of
 *       (end - str) + 1 bytes is always sufficient.
 *
 * The output is always NUL terminated, even when decoding stops early.
 * Returns 0 when the whole input was decoded, -1 when it stopped on a
 * malformed escape (for instance \u without four hex digits, or a
 * trailing backslash).
 *
 * \uXXXX escapes are converted one 16 bit unit at a time; surrogate
 * pairs are not combined.
 */
int json_parse_string(char * str, char *end, char * out)
{
	char *p = str, *pe = end ? end : str + strlen( str ), *eof = pe;
	int cs;
	uint16_t u = 0;

	out = out ? out : str;
	%%{
		machine json_str;

		xxdigit = (
			([0-9] @{ u = (u << 4) | fc - '0'; }) |
			([a-f] @{ u = (u << 4) | fc - 'a' + 0xa; }) |
			([A-F] @{ u = (u << 4) | fc - 'A' + 0xa; })
		);
		utf16 = ( xxdigit{4} ) >{ u = 0; } @{ out = json_append_utf8_glyph(out, u); };

		normal = any @{*out++ = fc;};
		# The last alternative copies any escaped character verbatim, so
		# it must exclude every character that has a dedicated branch
		# above it, otherwise both branches run and the output is doubled.
		escape =
				('\\' %{ *out++ = '\\'; } ) |
				('t' %{ *out++ = '\t'; } )  |
				('b' %{ *out++ = '\b'; } )  |
				('f' %{ *out++ = '\f'; } )  |
				('n' %{ *out++ = '\n'; } )  |
				('r' %{ *out++ = '\r'; } )  |
				('u' utf16 ) |
				( normal -- [\\tbfnru] )
			;
		main := (
			('\\' escape) |
			( normal -- '\\' )
		)*;

		# Initialize and execute.
		write init;
		write exec;
	}%%
	*out = 0;

	return cs >= json_str_first_final ? 0 : -1;
}

/*
 * Ragel directive: emit the generated data for the 'json' machine
 * (the main parser used by json_parse) at this point.
 */
%%{
	machine json;
	write data;
}%%

/*
 * Parse the NUL terminated extended-JSON text 'str' and report what is
 * found through the callbacks of driver 'd'.
 *
 * Callbacks used: open_object/close_object, open_array/close_array,
 * set_name, set_value, and (only when non-NULL) add_flag, open_data,
 * add_data, close_data. String names and values are reported as raw
 * [start, end) spans into 'str', escapes included; json_parse_string
 * can be used to decode them.
 *
 * Extensions over plain JSON visible in the grammar below: unquoted
 * identifiers as names, "(flag, flag)" lists after a name, hex integers,
 * optional trailing commas, and base64 data between '%' marks.
 *
 * Returns 0 when the whole input was accepted, -1 on a syntax error or
 * when arrays/objects are nested deeper than JSON_MAX_DEPTH. Diagnostics
 * are printed to stdout.
 */
int json_parse( json_driver_t *d, char * str )
{
	enum { JSON_MAX_DEPTH = 32 };	// capacity of the fcall return stack
	char *p = str, *pe = str + strlen( str ), *eof = pe;
	int cs;
	int stack[JSON_MAX_DEPTH], top = 0;
	int integer_sign = 1;	// for integer decode
	char * float_start = NULL;
	json_driver_value_t v;
	uint32_t b64 = 0;
	int b64_cnt = 0;

	memset(&v, 0, sizeof v);

	%%{
		machine json;

		# Every fcall pushes a return state on stack[]; refuse to nest
		# past its capacity instead of writing beyond the array.
		prepush {
			if (top >= JSON_MAX_DEPTH) {
				printf("### JSON Error : nesting deeper than %d levels\n", JSON_MAX_DEPTH);
				return -1;
			}
		}

		action obj_field_list_start { d->open_object(d); }
		action obj_field_list_done { d->close_object(d); }
		action obj_value_list_start { d->open_array(d); }
		action obj_value_list_done { d->close_array(d); }
		action obj_create_name { d->set_name(d, &v); }
		action obj_set_flag { if (d->add_flag) d->add_flag(d, &v); }
		action obj_set_string { d->set_value(d, json_driver_type_string, &v); }
		action obj_set_integer { d->set_value(d, json_driver_type_integer, &v); }
		action obj_set_float { d->set_value(d, json_driver_type_float, &v); }
		action obj_set_hex { d->set_value(d, json_driver_type_hex, &v); }
		action obj_set_true { v.u.v_bool = 1; d->set_value(d, json_driver_type_bool, &v); }
		action obj_set_false { v.u.v_bool = 0; d->set_value(d, json_driver_type_bool, &v); }
		action obj_set_null { d->set_value(d, json_driver_type_null, NULL); }

		action obj_start_data { if (d->open_data) d->open_data(d); }
		action obj_flush_data {
			if (d->add_data)
				for (int s=16,i = 0; i<b64_cnt; i++,s-=8)
					d->add_data(d, (b64 >> s) & 0xff);
		}
		action obj_end_data { if (d->close_data) d->close_data(d); }

		W = [ \t\r\n]**;

		#
		# quoted or unquoted string
		#
		action str_init { v.u.v_str.start = v.u.v_str.end = fpc; }
		action str_done { v.u.v_str.end = fpc; }

		# The body is any character except quote and backslash, or a
		# backslash followed by any character. This keeps the closing
		# quote unambiguous: a body that can itself contain a bare quote
		# forces the machine to track both readings of every quote,
		# which multiplies the number of generated states.
		string = '"' ((([^"\\] | ('\\' any))*) >str_init %str_done)  '"';
		ident = ((alpha | '_') (alnum | '_')*) >str_init %str_done;

		#
		#	negative/positive Integer
		#
		action integer_init { v.u.v_int = 0; integer_sign = 1; }
		action integer_minus { integer_sign = -1; }
		action integer_digit { v.u.v_int = (v.u.v_int * 10) + (fc - '0'); }
		action integer_done {  v.u.v_int *= integer_sign; }

		integer = (('-' @integer_minus | '+')? (digit+ @integer_digit))
			>integer_init %integer_done;

		#
		# hex integer
		#
		xxdigit = (
			([0-9] @{ v.u.v_int = (v.u.v_int << 4) | fc - '0'; }) |
			([a-f] @{ v.u.v_int = (v.u.v_int << 4) | fc - 'a' + 0xa; }) |
			([A-F] @{ v.u.v_int = (v.u.v_int << 4) | fc - 'A' + 0xa; })
		);
		hex = (('-' @integer_minus | '+')?( '0x' xxdigit+))
			>integer_init %integer_done;

		#
		# float/double value
		#
		action float_init { float_start = fpc; }
		# NOTE(review): %lf requires v.u.v_float to be a double; confirm in json.h
		action float_done { sscanf(float_start, "%lf", &v.u.v_float); }
		#
		# float values
		#
		float = (
			('-' | '+')? digit* '.' digit+ [fd]?
		) >float_init %float_done;

		#
		# base64 decoder
		#
		base64_char = (
			([A-Z] @{ b64 = (b64 << 6) | (fc - 'A'); }) |
			([a-z] @{ b64 = (b64 << 6) | (fc - 'a' + 26 ); }) |
			([0-9] @{ b64 = (b64 << 6) | (fc - '0' + 52 ); }) |
			('+' @{ b64 = (b64 << 6) | 62; }) |
			('/' @{ b64 = (b64 << 6) | 63; })
		);
		base64_pad = '=' @{ b64 = (b64 << 6); };
		base64_four = (
			base64_char base64_char base64_char base64_char
		) %{ b64_cnt = 3; } %obj_flush_data;
		base64_padder = (
			base64_char base64_char
			(
				(( base64_char base64_pad )
					%{ b64_cnt = 2; } ) |
				(( base64_pad base64_pad )
					%{ b64_cnt = 1; } )
			)
		) %obj_flush_data;

		base64 = ( base64_four** (base64_four | base64_padder) ) >{b64 = 0;}
				%err{ printf("### base64 Error : '%s'\n", p); };

		#
		# JSON value, extended
		#
		json_value = (
			(string %obj_set_string) |
			(integer %obj_set_integer) |
			(hex %obj_set_hex ) |
			(float %obj_set_float) |
			('true' %obj_set_true) |
			('false' %obj_set_false) |
			('null' %obj_set_null) |
			('{' @{ fhold; fcall obj_field_list; } ) |
			('[' @{ fhold; fcall json_value_list; } ) |
			(('%' (W base64)* W '%') >obj_start_data %obj_end_data)
		);

		json_value_list := (
			'[' (
				'' |
				(W json_value (W ',' W json_value)* )
			) W ','? W ']'
		) >obj_value_list_start @obj_value_list_done @{ fret; }
				%err{ printf("### Array[%d] Error : '%s'\n", top, p); };

		obj_field_flag = ( ident ) %obj_set_flag;
		obj_field_flags = (
			'(' W obj_field_flag (W ',' W obj_field_flag)** ')'
		);
		obj_field = ((string | ident) %obj_create_name) W obj_field_flags? W ':' W json_value;

		obj_field_list := (
			'{' (
				'' |
				(W obj_field (W ',' W obj_field)** )
			) W ','? W '}'
		) >obj_field_list_start @obj_field_list_done @{ fret; }
				%err{ printf("### Object[%d] Error : '%s'\n", top, p); };

		main := (
			W json_value
		) %err{ printf("### JSON Error : '%s'\n", p); };

		# Initialize and execute.
		write init;
		write exec;
	}%%

	return cs >= json_first_final ? 0 : -1;
}

#ifdef JSON_TEST_UNIT
/*
 * Test driver: print a field name, taken verbatim from the raw
 * [start, end) span in 'v', as a quoted string followed by a colon.
 */
static void d_set_name(struct json_driver_t *d,
		json_driver_value_t * v)
{
	const char *first = v->u.v_str.start;
	int width = v->u.v_str.end - first;

	(void)d;
	printf("\"%*.*s\": ", width, width, first);
}

/* Test driver: print the opening bracket of an array and flush stdout. */
static void d_open_array(struct json_driver_t *d)
{
	(void)d;
	fputs("[", stdout);
	fflush(stdout);
}

/* Test driver: print the opening brace of an object and flush stdout. */
static void d_open_object(struct json_driver_t *d)
{
	(void)d;
	fputs("{", stdout);
	fflush(stdout);
}

/*
 * Test driver: print one parsed value according to 'type'.
 *
 * d    : driver (unused here).
 * type : one of the json_driver_type_* constants; unknown types print
 *        nothing.
 * v    : the value; not dereferenced for json_driver_type_null.
 *
 * String values are decoded with json_parse_string before printing.
 * The decode buffer is sized from the raw span: the decoded text is
 * never longer than the raw text, so (end - start) + 1 bytes always
 * suffice, whatever the length of the string.
 */
static void d_set_value(struct json_driver_t *d,
		int type,
		json_driver_value_t * v)
{
	(void)d;
	switch (type) {
		case json_driver_type_null:
			printf("null, ");
			break;
		case json_driver_type_bool:
			printf("%s, ", v->u.v_bool ? "true" : "false");
			break;
		case json_driver_type_integer:
			printf("%d, ", (int)v->u.v_int);
			break;
		case json_driver_type_hex:
			printf("0x%x, ", (int)v->u.v_int);
			break;
		case json_driver_type_float:
			printf("%f, ", (float)v->u.v_float);
			break;
		case json_driver_type_string: {
			size_t raw_len = (size_t)(v->u.v_str.end - v->u.v_str.start);
			char *buf = malloc(raw_len + 1);

			if (!buf) {
				fprintf(stderr, "### out of memory decoding string\n");
				break;
			}
			json_parse_string(v->u.v_str.start, v->u.v_str.end, buf);
			printf("\"%s\": ", buf);
			free(buf);
		}	break;
		default:
			break;
	}
	fflush(stdout);
}

/* Test driver: print the closing bracket of an array and flush stdout. */
static void d_close_array(struct json_driver_t *d)
{
	(void)d;
	fputs("],", stdout);
	fflush(stdout);
}

/* Test driver: print the closing brace of an object and flush stdout. */
static void d_close_object(struct json_driver_t *d)
{
	(void)d;
	fputs("},", stdout);
	fflush(stdout);
}

/* Test driver: print the marker that opens a binary data section. */
static void d_open_data(struct json_driver_t *d)
{
	(void)d;
	printf("%% '");
	fflush(stdout);
}
/* Test driver: print one decoded data byte as a character. */
static void d_add_data(struct json_driver_t *d, uint8_t data)
{
	(void)d;
	printf("%c", data);
	fflush(stdout);
}
/* Test driver: print the marker that closes a binary data section. */
static void d_close_data(struct json_driver_t *d)
{
	(void)d;
	printf("' %%,");
	fflush(stdout);
}

/*
 * Driver used by the test program: every callback prints what it is
 * given. add_flag is not set, so it stays NULL.
 */
json_driver_t driver = {
	/* containers */
	.open_object = d_open_object,
	.close_object = d_close_object,
	.open_array = d_open_array,
	.close_array = d_close_array,

	/* names and values */
	.set_name = d_set_name,
	.set_value = d_set_value,

	/* binary data sections */
	.open_data = d_open_data,
	.add_data = d_add_data,
	.close_data = d_close_data,
};

/*
 * Test program: parse every command line argument as a JSON text with
 * the printing driver, one output line per argument.
 */
int main(int argc, char * argv[])
{
	int i = 1;

	while (i < argc) {
		char *text = argv[i++];

		printf("### parsing '%s'\n", text);
		json_parse(&driver, text);
		printf("\n");
	}
	return 0;
}

#endif

_______________________________________________
ragel-users mailing list
ragel-users at complang.org
http://www.complang.org/mailman/listinfo/ragel-users



More information about the ragel-users mailing list