Rewrote lexer

bbb79f6f · Taddeüs Kroes · 6ee79d6b · bbb79f6f · bbb79f6f
Commit bbb79f6f authored 10 years ago by Taddeüs Kroes
--- a/lexer.mll
+++ b/lexer.mll
 {
-open Lexing
-open Parser
+  (* Tokenizer according to definition at
+   * http://www.w3.org/TR/CSS2/syndata.html#tokenization *)
+  open Lexing
+  open Parser

-exception SyntaxError of string
+  exception SyntaxError of string

-let next_line lexbuf =
-  let pos = lexbuf.lex_curr_p in
-  lexbuf.lex_curr_p <- {
-    pos with pos_bol = lexbuf.lex_curr_pos;
-             pos_lnum = pos.pos_lnum + 1
-  }
+  let next_line lexbuf =
+    let pos = lexbuf.lex_curr_p in
+    lexbuf.lex_curr_p <- {
+      pos with pos_bol = lexbuf.lex_curr_pos;
+              pos_lnum = pos.pos_lnum + 1
+    }
 }

+let h           = ['0'-'9''a'-'f']
+let wc          = '\r''\n' | [' ''\t''\r''\n''\012']
+let nonascii    = ['\160'-'\255']
+let s           = [' ''\t''\r''\n''\012']+
+let w           = s?
+let nl          = '\n' | '\r''\n' | '\r' | '\012'
+let unicode     = '\\' h(h(h(h(h(h)?)?)?)?)? wc?
+let escape      = unicode | '\\'[^'\r''\n''\012''0'-'9''a'-'f']
+let nmstart     = ['_''a'-'z'] | nonascii | escape
+let nmchar      = ['_''a'-'z''0'-'9''-'] | nonascii | escape
+let string1     = '"'([^'\n''\r''\012''"'] | '\\'nl | escape)*'"'
+let string2     = '\''([^'\n''\r''\012''\''] | '\\'nl | escape)*'\''
+let mystring    = string1 | string2
+let badstring1  = '"'([^'\n''\r''\012''"'] | '\\'nl | escape)*'\\'?
+let badstring2  = '\''([^'\n''\r''\012''\''] | '\\'nl | escape)*'\\'?
+let badstring   = badstring1 | badstring2
+let badcomment1 = '/''*'[^'*']*'*'+([^'/''*'][^'*']*'*'+)*
+let badcomment2 = '/''*'[^'*']*('*'+[^'/''*'][^'*']*)*
+let badcomment  = badcomment1 | badcomment2
+let baduri1     = "url("w(['!''#''$''%''&''*'-'['']'-'~'] | nonascii | escape)*w
+let baduri2     = "url("w mystring w
+let baduri3     = "url("w badstring
+let baduri      = baduri1 | baduri2 | baduri3
+let comment     = '/''*'[^'*']*'*'+([^'/''*'][^'*']*'*'+)'*''/'
+let ident       = '-'? nmstart nmchar*
+let name        = nmchar+
+let num         = ['0'-'9']+ | ['0'-'9']*'.'['0'-'9']+
+let url         = (['!''#''$''%''&''*''-''~'] | nonascii | escape)*
+
 rule token = parse
-  | '(' { LPAREN }
-  | ')' { RPAREN }
-  | '{' { LBRACE }
-  | '}' { RBRACE }
-  | ';' { SEMICOL }
-  | ',' { COMMA }
-  | ':' { COLON }
-
-  | "@media"     { MEDIA }
-  | "@import"    { IMPORT }
-  | "@charset"   { CHARSET }
-  | "@page"      { PAGE }
-  | "@font-face" { FONTFACE }
-  | "@namespace" { NAMESPACE }
-
-  | "!important" { IMPORTANT }
-  | ['A'-'Z''a'-'z''0'-'9''_''-''#''.']+ as id { ID id }
-  | ['.''#'':']['A'-'Z''a'-'z''_''-']['A'-'Z''a'-'z''0'-'9''_''-''.''#'':']* as id { SELECTOR id }
-
-  | '\r' | '\n' | "\r\n" { next_line lexbuf; token lexbuf }
-  | [' ''\t']+           { token lexbuf }
-  | "/*"                 { comment lexbuf }
-  | '"'                  { str (Buffer.create 17) lexbuf }
-  | eof | '\000'         { EOF }
-
-  | _ as chr { raise (SyntaxError ("unexpected char: " ^ Char.escaped chr)) }
-
-(* Multi-line comments *)
-and comment = parse
-  | '\r' | '\n' | "\r\n"  { next_line lexbuf; comment lexbuf }
-  | "*/"                  { token lexbuf }
-  | _                     { comment lexbuf }
-
-(* Strings *)
-and str buf = parse
-  | '"'              { STRING (Buffer.contents buf) }
-  | '\\''/'          { Buffer.add_char buf '/';    str buf lexbuf }
-  | '\\''\\'         { Buffer.add_char buf '\\';   str buf lexbuf }
-  | '\\''b'          { Buffer.add_char buf '\b';   str buf lexbuf }
-  | '\\''f'          { Buffer.add_char buf '\012'; str buf lexbuf }
-  | '\\''n'          { Buffer.add_char buf '\n';   str buf lexbuf }
-  | '\\''r'          { Buffer.add_char buf '\r';   str buf lexbuf }
-  | '\\''t'          { Buffer.add_char buf '\t';   str buf lexbuf }
-  | [^'"''\\']+ as s { Buffer.add_string buf s;    str buf lexbuf }
-  | eof              { raise (SyntaxError "unterminated string") }
+  | s                   { S }
+
+  | comment             (* ignore comments *)
+  | badcomment          (* unclosed comment at EOF *)
+
+  | "<!--"              { CDO }
+  | "-->"               { CDC }
+  | "~="                { INCLUDES }
+  | "|="                { DASHMATCH }
+
+  | mystring            { STRING }
+  | badstring           { BAD_STRING }
+
+  | ident as id         { IDENT id }
+
+  | '#' (name as name)  { HASH name }
+
+  | "@import"           { IMPORT_SYM }
+  | "@page"             { PAGE_SYM }
+  | "@media"            { MEDIA_SYM }
+  | "@charset"          { CHARSET_SYM }
+
+  | '!' (w | comment)* "important"  { IMPORTANT_SYM }
+
+  | (num as n) "em"     { EMS (int_of_string n) }
+  | (num as n) "ex"     { EXS (int_of_string n) }
+  | (num as n) "px"     { LENGTH (int_of_string n, "px") }
+  | (num as n) "cm"     { LENGTH (int_of_string n, "cm") }
+  | (num as n) "mm"     { LENGTH (int_of_string n, "mm") }
+  | (num as n) "in"     { LENGTH (int_of_string n, "in") }
+  | (num as n) "pt"     { LENGTH (int_of_string n, "pt") }
+  | (num as n) "pc"     { LENGTH (int_of_string n, "pc") }
+  | (num as n) "deg"    { ANGLE (int_of_string n, "deg") }
+  | (num as n) "rad"    { ANGLE (int_of_string n, "rad") }
+  | (num as n) "grad"   { ANGLE (int_of_string n, "grad") }
+  | (num as n) "ms"     { TIME (int_of_string n, "ms") }
+  | (num as n) "s"      { TIME (int_of_string n, "s") }
+  | (num as n) "hz"     { FREQ (int_of_string n, "hz") }
+  | (num as n) "khz"    { FREQ (int_of_string n, "khz") }
+  | (num as n) "%"      { PERCENTAGE (int_of_string n) }
+  | (num as n) (ident as dim)  { DIMENSION (int_of_string n, dim) }
+  | num as n            { NUMBER (int_of_string n) }
+
+  | "url(" w (mystring as uri) w ")"  { URI uri }
+  | "url(" w (url as uri) w ")"       { URI uri }
+  | baduri as uri                     { BAD_URI uri }
+
+  | (ident as fn) '('   { FUNCTION fn }
+
+  | '('                 { LPAREN }
+  | ')'                 { RPAREN }
+  | '{'                 { LBRACE }
+  | '}'                 { RBRACE }
+  | '['                 { LBRACK }
+  | ']'                 { RBRACK }
+  | ';'                 { SEMICOL }
+  | ':'                 { COLON }
+
+  (*
  | _ as c { raise (SyntaxError ("illegal string character: " ^ Char.escaped c)) }
+  *)
--- a/parser.mly
+++ b/parser.mly
@@ -6,10 +6,12 @@ let prop2str (name, value) = name ^ ":" ^ Stringify.value2str value
 %}

 (* Tokens *)
-%token LPAREN RPAREN LBRACE RBRACE SEMICOL COMMA COLON
-%token MEDIA IMPORT CHARSET PAGE FONTFACE NAMESPACE
-%token IMPORTANT EOF
-%token <string> ID STRING SELECTOR
+%token S CDO CDC INCLUDES DASHMATCH STRING BAD_STRING IMPORT_SYM PAGE_SYM
+%token MEDIA_SYM CHARSET_SYM IMPORTANT_SYM
+%token LPAREN RPAREN LBRACE RBRACE LBRACK RBRACK SEMICOL COLON
+%token <int> EMS EXS PERCENTAGE NUMBER
+%token <int * string> LENGTH ANGLE TIME FREQ DIMENSION
+%token <string> IDENT HASH URI BAD_URI FUNCTION

 (* Start symbol *)
 %type <Types.decl list> stylesheet
@@ -18,68 +20,43 @@ let prop2str (name, value) = name ^ ":" ^ Stringify.value2str value
 %%

 (* Left-recursive list (use List.rev to obtain correctly ordered list) *)
+(*
 llist(x):
  |            { [] }
  | tl=llist(x) hd=x { hd :: tl }
-
-separated_llist(sep, x):
-  |                      { [] }
-  | tl=llist(x) sep hd=x { hd :: tl }
+*)

 stylesheet:
-  | decls=llist(decl) EOF
-  { List.rev decls }
-
-selector:
-  | id=ID                    { [id] }
-  | id=SELECTOR              { [id] }
-  | tl=selector hd=ID        { hd :: tl }
-  | tl=selector hd=SELECTOR  { hd :: tl }
-
-value:
-  | str=STRING                      { Str str }
-  | lit=ID                          { Lit lit }
-  | name=ID LPAREN arg=value RPAREN { Fn (name, arg) }
-  | IMPORTANT                       { Imp }
+  | ( CDO | CDC | S | statement )*

-prop:
-  | name=ID COLON v=value+
-  { (name, match v with [hd] -> hd | _ -> Lst v) }
+statement:
+  | ruleset
+  | at_rule

-propline:
-  | p=prop SEMICOL
-  { p }
+at_rule:
+  | ATKEYWORD S* any* ( block | SEMICOL S* )

-props:
-  | LBRACE p=llist(propline) last=prop? RBRACE
-  { List.rev p @ (match last with None -> [] | Some p -> [p]) }
+block:
+  | LBRACE S* ( any | block | ATKEYWORD S* | SEMICOL S* )* RBRACE S*

-group:
-  | s=separated_nonempty_list(COMMA, selector) p=props
-  { Group (List.rev s, p) }
+ruleset:
+  | selectors=any+ LBRACE S* declaration? ( SEMICOL S* declaration? )* RBRACE S*

-%inline media:
-  | m=ID
-  { m }
-  | LPAREN p=prop RPAREN
-  { "(" ^ prop2str p ^ ")" }
+declaration:
+  | name=IDENT S* COLON S* value=value
+  { Property (name, value) }

-%inline stringopt: f=STRING | f=ID { f }
-
-decl:
-  | g=group
-  { g }
-  | MEDIA queries=separated_nonempty_list(COMMA, media) LBRACE groups=llist(group) RBRACE
-  { Media (queries, List.rev groups) }
-  | IMPORT f=stringopt q=separated_list(COMMA, ID) SEMICOL
-  { Import (f, q) }
-  | CHARSET c=stringopt SEMICOL
-  { Charset c }
-  | PAGE query=ID? p=props
-  { Page (query, p) }
-  | FONTFACE p=props
-  { Fontface p }
-  | NAMESPACE prefix=ID? uri=STRING SEMICOL
-  { Namespace (prefix, uri) }
-
-%%
+value:
+  | ( any | block | ATKEYWORD S* )+
+
+any:
+  | ( IDENT | NUMBER | PERCENTAGE | DIMENSION | STRING | DELIM | URI | HASH |
+  UNICODE-RANGE | INCLUDES | DASHMATCH | COLON | FUNCTION S* (any|unused)*
+  RPAREN | LPAREN S* (any|unused)* RPAREN | LBRACK S* (any|unused)* RBRACK) S*
+
+unused:
+  | block
+  | ATKEYWORD S*
+  | SEMICOL S*
+  | CDO S*
+  | CDC S*