[ragel-users] ragel and memory usage

Adrian Thurston thurs... at cs.queensu.ca
Sat Jan 20 22:54:34 UTC 2007

Hi Damir,

There is likely an ambiguity in there which causes a state explosion. But
since the grammar is so large, trying to track it down is probably not
worth anybody's time. Instead I would suggest starting from scratch and
taking an incremental approach. Start with a simple superset of the
language. For example just separate the request line from the headers and
the message body. Be liberal with actions. Verify that it works properly,
then slowly refine the grammar. If you take this approach and test
frequently you'll easily discover where the ambiguities are because
suddenly things will go way wrong.


> Hi all,
> I'm trying to build a grammar for parsing SIP messages (as in rfc3261),
> but I have encountered a problem that I cannot solve - it seems that
> I've hit a state explosion, since ragel just keeps allocating memory
> and finally dies.
> Here's my grammar (sorry for pasting it here like this)
> --cut--
> %%{
>   machine sip_parser;
>   action mark
>   {
>   }
>   action req_i
>   {
>   }
>   action req_a
>   {
>   }
>   action req_o
>   {
>   }
>   action req_b
>   {
>   }
>   action req_c
>   {
>   }
>   action req_r
>   {
>   }
>   action create_req
>   {
>   }
>   action request_method
>   {
>   }
>   action uri_host
>   {
>   }
>   action done
>   {
>     fbreak;
>   }
>   action uri_port
>   {
>   }
>   action uri_pass
>   {
>   }
>   action uri_user
>   {
>   }
>   action uri_scheme
>   {
>   }
>   action set_req_uri
>   {
>   }
>   action add_param
>   {
>   }
>   action add_param_value
>   {
>   }
>   action uri_query
>   {
>   }
>   CRLF = "\r\n";
>   SP = " ";
>   HTAB = "\t";
>   DQUOTE = "\"";
>   LHEX = digit | [a-f];
>   alphanum = [a-zA-Z0-9];
>   reserved = (";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" |
> ",");
>   mark = ("-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")");
>   unreserved = (alphanum | mark);
>   escaped = ("%" xdigit xdigit);
>   LWS = ((" "* CRLF)? " "+);
>   SWS = (LWS?);
>   HCOLON = ((" " | "\t")* ":" SWS);
>   token = (alphanum | "-" | "." | "!" | "%" | "*" | "_" | "+" | "`" |
> "'" | "~" )+;
>   separators = ("(" | ")" | "<" | ">" | "@" | "," | ";" | ":" | "\\" |
> DQUOTE | "/" | "[" | "]" | "?" | "=" | "{" | "}" | SP | HTAB);
>   word = (alphanum | "-" | "." | "!" | "%" | "*" | "_" | "+" | "`" |
> "'" | "~" | "(" | ")" | "<" | ">" | ":" | "\\" | DQUOTE | "/" | "[" |
> "]" | "?" | "{" | "}")+;
>   STAR   = (SWS "*" SWS);
>   SLASH  = (SWS "/" SWS);
>   EQUAL  = (SWS "=" SWS);
>   LPAREN = (SWS "(" SWS);
>   RPAREN = (SWS ")" SWS);
>   RAQUOT = (">" SWS);
>   LAQUOT = (SWS "<");
>   COMMA  = (SWS "," SWS);
>   SEMI   = (SWS ";" SWS);
>   COLON  = (SWS ":" SWS);
>   qdtext = (LWS | 0x21 | 0x23..0x5B | 0x5D..0x7E);
>   quoted_pair = ("\\" (0x00..0x09 | 0x0B..0x0C | 0x0E..0x7F));
>   quoted_string = (SWS DQUOTE (qdtext | quoted_pair)* DQUOTE);
>   ctext    = (0x21..27 | 0x2A..0x5B | 0x5D..0x7E | LWS);
>   comment_r = (LPAREN? (ctext | quoted_pair) RPAREN?);
>   comment  = (LPAREN (comment_r)* RPAREN);
>   port = digit+;
>   IPv4address =  (digit{1,3} "." digit{1,3} "." digit{1,3} "."
> digit{1,3});
>   toplabel = (alpha | (alpha (alphanum | "-")* alphanum));
>   domainlabel = (alphanum | (alphanum (alphanum | "-")* alphanum));
>   hostname = ((domainlabel ".")* toplabel "."?);
>   host = (hostname | IPv4address);
>   hostport = (host >mark %uri_host (":" port >mark %uri_port)?);
>   password = ((unreserved | escaped | "&" | "=" | "+" | "$" | "," )*);
>   user_unreserved = ("&" | "=" | "+" | "$" | "," | ";" | "?" | "/");
>   user = ((unreserved | escaped | user_unreserved)+);
>   userinfo = (user >mark %uri_user (":" password >mark %uri_pass)?
> "@");
>   extension_method = token;
>   Method = (("INVITE" %req_i | "ACK" %req_a | "OPTIONS" %req_o | "BYE"
> %req_b | "CANCEL" %req_c | "REGISTER" %req_r | extension_method)) >mark
> %request_method;
>   hnv_unreserved = ("[" | "]" | "|" | "?" | ":" | "+" | "$");
>   hvalue = ((hnv_unreserved | unreserved | escaped)*);
>   hname = ((hnv_unreserved | unreserved | escaped)+);
>   header = (hname "=" hvalue);
>   headers = ("?" header ("&" header)*);
>   param_unreserved = ("[" | "]" | "/" | ":" | "&" | "+" | "$");
>   paramchar = (param_unreserved | unreserved | escaped);
>   pvalue = (paramchar+);
>   pname = (paramchar+);
>   other_param = (pname >mark %add_param ("=" pvalue >mark
> %add_param_value)?);
>   lr_param = "lr" >mark %add_param;
>   maddr_param = "maddr" >mark %add_param "=" host >mark
> %add_param_value;
>   method_param = "method" >mark %add_param "=" Method >mark
> %add_param_value;
>   ttl = (digit{1,3});
>   ttl_param = "ttl" >mark %add_param "=" ttl >mark %add_param_value;
>   other_user = token;
>   user_param = "user" >mark %add_param "=" ("phone" | "ip" |
> other_user) >mark %add_param_value;
>   other_transport = token;
>   transport_param = ("transport" >mark %add_param "=" ("udp" | "tcp" |
> "sctp" | "tls" | other_transport) >mark %add_param_value);
>   uri_parameter = (transport_param | user_param | method_param |
> ttl_param | maddr_param | lr_param | other_param);
>   uri_parameters = ((";" uri_parameter)*);
>   SIP_URI = ("sip:" %{ m_data->m_uri.scheme() = "sip"; } (userinfo)?
> hostport uri_parameters (headers)?);
>   SIPS_URI = ("sips:" %{ m_data->m_uri.scheme() = "sips"; } (userinfo)?
> hostport uri_parameters (headers)?);
>   x_token = ("x-" token);
>   ietf_token = token;
>   extension_token = (ietf_token | x_token);
>   composite_type = ("message" | "multipart" | extension_token);
>   discrete_type = ("text" | "image" | "audio" | "video" | "application"
> | extension_token);
>   m_type = (discrete_type | composite_type);
>   m_value = (token | quoted_string);
>   m_attribute = token;
>   m_parameter = (m_attribute EQUAL m_value);
>   iana_token = token;
>   m_subtype = (extension_token | iana_token);
>   uric           = (reserved | unreserved | escaped);
>   query          = (uric*);
>   reg_name       = (unreserved | escaped | "$" | "," | ";" | ":" | "@"
> | "&" | "=" | "+" )+;
>   srvr           = ((userinfo "@")? hostport)?;
>   authority      = (srvr | reg_name);
>   scheme         = (alpha (alpha | digit | "+" | "-" | "." )*);
>   pchar          = (unreserved | escaped | ":" | "@" | "&" | "=" | "+"
> | "$" | ",");
>   param          = (pchar*);
>   segment        = (pchar* (";" param)*);
>   path_segments  = (segment ("/" segment)*);
>   uric_no_slash  = (unreserved | escaped | ";" | "?" | ":" | "@" | "&"
> | "=" | "+" | "$" | ",");
>   opaque_part    = (uric_no_slash uric*);
>   abs_path       = ("/" path_segments);
>   net_path       = ("//" authority (abs_path)?);
>   hier_part      = ((net_path | abs_path) ("?" query >mark
> %uri_query)?);
>   absoluteURI    = (scheme >mark %uri_scheme ":" (hier_part |
> opaque_part));
>   gen_value = (token | host | quoted_string);
>   generic_param = (token (EQUAL gen_value)?);
>   qvalue = (("0" ("." digit{,3})?) | ("1" ("." ("0"){,3})?));
>   accept_param = (("q" EQUAL qvalue) | generic_param);
>   media_range = (("*" "/" "*" | ( m_type SLASH "*" ) | ( m_type SLASH
> m_subtype )) (SEMI m_parameter)*);
>   accept_range = (media_range (SEMI accept_param)*);
>   Accept = ("Accept" HCOLON (accept_range (COMMA accept_range)*))?;
>   content_coding   = (token);
>   codings          = (content_coding | "*");
>   encoding         = (codings (SEMI accept_param)*);
>   Accept_Encoding  = ("Accept-Encoding" HCOLON (encoding (COMMA
> encoding)*)?);
>   language_range   = (((alpha{1,8} ("-" alpha{1,8})*) | "*" ));
>   language         = (language_range (SEMI accept_param)*);
>   Accept_Language  = ("Accept-Language" HCOLON (language (COMMA
> language)*)?);
>   alert_param      = (LAQUOT absoluteURI RAQUOT (SEMI generic_param
> )*);
>   Alert_Info       = ("Alert-Info" HCOLON alert_param (COMMA
> alert_param)*);
>   Allow            = ("Allow" HCOLON (Method (COMMA Method)*)?);
>   opaque           = ("opaque" EQUAL quoted_string);
>   algorithm        = ("algorithm" EQUAL ("MD5" | "MD5-sess" | token));
>   realm_value      = (quoted_string);
>   realm            = ("realm" EQUAL realm_value);
>   auth_scheme      = (token);
>   auth_param_name  = (token);
>   auth_param       = (auth_param_name EQUAL (token | quoted_string));
>   other_response   = (auth_scheme LWS auth_param (COMMA auth_param)*);
>   request_digest   = (LDQUOT LHEX{32} RDQUOT);
>   dresponse        = ("response" EQUAL request_digest);
>   nonce_value      = (quoted_string);
>   nonce            = ("nonce" EQUAL nonce_value);
>   nc_value         = (LHEX{8});
>   nonce_count      = ("nc" EQUAL nc_value);
>   cnonce_value     = (nonce_value);
>   cnonce           = ("cnonce" EQUAL cnonce_value);
>   qop_value        = ("auth" | "auth-int" | token);
>   message_qop      = ("qop" EQUAL qop_value);
>   digest_uri_value = (absoluteURI); # fixme
>   digest_uri       = ("uri" EQUAL LDQUOT digest_uri_value RDQUOT);
>   username_value   = (quoted_string);
>   username         = ("username" EQUAL username_value);
>   dig_resp         = (username | realm | nonce | digest_uri | dresponse
> | algorithm | cnonce | opaque | message_qop | nonce_count |
> auth_param);
>   digest_response  = (dig_resp (COMMA dig_resp)*);
>   credentials      = (("Digest" LWS digest_response) | other_response);
>   Authorization    = ("Authorization" HCOLON credentials);
>   response_digest      = (LDQUOT LHEX* RDQUOT);
>   response_auth        = ("rspauth" EQUAL response_digest);
>   nextnonce            = ("nextnonce" EQUAL nonce_value);
>   ainfo                = (nextnonce | message_qop | response_auth |
> cnonce | nonce_count);
>   Authentication_Info  = ("Authentication-Info" HCOLON ainfo (COMMA
> ainfo)*);
>   callid   =  (word ("@" word)?);
>   Call_ID  = (( "Call-ID" | "i" ) HCOLON callid);
>   info_param  = (("purpose" EQUAL ("icon" | "info" | "card" | token)) |
> generic_param);
>   info        = (LAQUOT absoluteURI RAQUOT (SEMI info_param)*);
>   Call_Info   = ("Call-Info" HCOLON info (COMMA info)*);
>   delta_seconds      = (digit+);
>   contact_extension  = (generic_param);
>   c_p_expires    = ("expires" EQUAL delta_seconds);
>   c_p_q          = ("q" EQUAL qvalue);
>   contact_params = (c_p_q | c_p_expires);
>   display_name   = ((token LWS)* | quoted_string);
>   addr_spec      = (SIP_URI | SIPS_URI | absoluteURI);
>   name_addr      = ((display_name)? LAQUOT addr_spec RAQUOT);
>   contact_param  = ((name_addr | addr_spec) (SEMI contact_params)*);
>   Contact     = (("Contact" | "m" ) HCOLON (STAR | (contact_param
> (COMMA contact_param)*)));
>   disp_extension_token  = (token);
>   other_handling        = (token);
>   handling_param        = ("handling" EQUAL ( "optional" | "required" |
> other_handling));
>   disp_param            = (handling_param | generic_param);
>   disp_type             = ("render" | "session" | "icon" | "alert" |
> disp_extension_token);
>   Content_Disposition   = ("Content-Disposition" HCOLON disp_type (SEMI
> disp_param)*);
>   Content_Encoding  = (("Content-Encoding" | "e" ) HCOLON
> content_coding (COMMA content_coding)*);
>   subtag            = (alpha{1,8});
>   primary_tag       = (alpha{1,8});
>   language_tag      = (primary_tag ("-" subtag)*);
>   Content_Language  = ("Content-Language" HCOLON language_tag (COMMA
> language_tag)*);
>   Content_Length    = (("Content-Length" | "l") HCOLON digit+);
>   media_type        = (m_type SLASH m_subtype (SEMI m_parameter)*);
>   Content_Type      = (("Content-Type" | "c" ) HCOLON media_type);
>   CSeq = ("CSeq" HCOLON digit+ LWS Method);
>   message_header = (Accept |
>                     Accept_Encoding |
>                     Accept_Language |
>                     Alert_Info |
>                     Allow |
>                     Authentication_Info |
>                     Authorization |
>                     Call_ID |
>                     Call_Info |
>                     Contact |
>                     Content_Disposition |
>                     Content_Encoding |
>                     Content_Language |
>                     Content_Length |
>                     Content_Type |
>                     CSeq) CRLF;
>   SIP_Version    = "SIP" "/" digit "." digit;
>   Request_URI    = SIP_URI | SIPS_URI | absoluteURI;
>   Request_Line   = Method %create_req SP Request_URI %set_req_uri SP
> SIP_Version CRLF;
>   Request        = Request_Line (message_header)* CRLF;
> main := Request;
> }%%
> %% write data;
> --cut--
> I think that the problem is somewhere around this line:
> hier_part      = ((net_path | abs_path) ("?" query >mark %uri_query)?);
> if I remove the ">mark %uri_query" part, ragel is able to complete the
> operation.
> Can anyone give me some clues about what's gone wrong?
> tia,
> d
> >

