Commit bbb79f6f authored by Taddeüs Kroes's avatar Taddeüs Kroes

Rewrote lexer

parent 6ee79d6b
{
open Lexing
open Parser
(* Tokenizer according to definition at
* http://www.w3.org/TR/CSS2/syndata.html#tokenization *)
open Lexing
open Parser
exception SyntaxError of string
exception SyntaxError of string
let next_line lexbuf =
let next_line lexbuf =
let pos = lexbuf.lex_curr_p in
lexbuf.lex_curr_p <- {
pos with pos_bol = lexbuf.lex_curr_pos;
......@@ -12,50 +14,94 @@ let next_line lexbuf =
}
}
let h = ['0'-'9''a'-'f']
let wc = '\r''\n' | [' ''\t''\r''\n''\012']
let nonascii = ['\160'-'\255']
let s = [' ''\t''\r''\n''\012']+
let w = s?
let nl = '\n' | '\r''\n' | '\r' | '\012'
let unicode = '\\' h(h(h(h(h(h)?)?)?)?)? wc?
let escape = unicode | '\\'[^'\r''\n''\012''0'-'9''a'-'f']
let nmstart = ['_''a'-'z'] | nonascii | escape
let nmchar = ['_''a'-'z''0'-'9''-'] | nonascii | escape
let string1 = '"'([^'\n''\r''\012''"'] | '\\'nl | escape)*'"'
let string2 = '\''([^'\n''\r''\012''\''] | '\\'nl | escape)*'\''
let mystring = string1 | string2
let badstring1 = '"'([^'\n''\r''\012''"'] | '\\'nl | escape)*'\\'?
let badstring2 = '\''([^'\n''\r''\012''\''] | '\\'nl | escape)*'\\'?
let badstring = badstring1 | badstring2
let badcomment1 = '/''*'[^'*']*'*'+([^'/''*'][^'*']*'*'+)*
let badcomment2 = '/''*'[^'*']*('*'+[^'/''*'][^'*']*)*
let badcomment = badcomment1 | badcomment2
let baduri1 = "url("w(['!''#''$''%''&''*'-'['']'-'~'] | nonascii | escape)*w
let baduri2 = "url("w mystring w
let baduri3 = "url("w badstring
let baduri = baduri1 | baduri2 | baduri3
let comment = '/''*'[^'*']*'*'+([^'/''*'][^'*']*'*'+)'*''/'
let ident = '-'? nmstart nmchar*
let name = nmchar+
let num = ['0'-'9']+ | ['0'-'9']*'.'['0'-'9']+
let url = (['!''#''$''%''&''*''-''~'] | nonascii | escape)*
rule token = parse
| s { S }
| comment (* ignore comments *)
| badcomment (* unclosed comment at EOF *)
| "<!--" { CDO }
| "-->" { CDC }
| "~=" { INCLUDES }
| "|=" { DASHMATCH }
| mystring { STRING }
| badstring { BAD_STRING }
| ident as id { IDENT id }
| '#' (name as name) { HASH name }
| "@import" { IMPORT_SYM }
| "@page" { PAGE_SYM }
| "@media" { MEDIA_SYM }
| "@charset" { CHARSET_SYM }
| '!' (w | comment)* "important" { IMPORTANT_SYM }
| (num as n) "em" { EMS (int_of_string n) }
| (num as n) "ex" { EXS (int_of_string n) }
| (num as n) "px" { LENGTH (int_of_string n, "px") }
| (num as n) "cm" { LENGTH (int_of_string n, "cm") }
| (num as n) "mm" { LENGTH (int_of_string n, "mm") }
| (num as n) "in" { LENGTH (int_of_string n, "in") }
| (num as n) "pt" { LENGTH (int_of_string n, "pt") }
| (num as n) "pc" { LENGTH (int_of_string n, "pc") }
| (num as n) "deg" { ANGLE (int_of_string n, "deg") }
| (num as n) "rad" { ANGLE (int_of_string n, "rad") }
| (num as n) "grad" { ANGLE (int_of_string n, "grad") }
| (num as n) "ms" { TIME (int_of_string n, "ms") }
| (num as n) "s" { TIME (int_of_string n, "s") }
| (num as n) "hz" { FREQ (int_of_string n, "hz") }
| (num as n) "khz" { FREQ (int_of_string n, "khz") }
| (num as n) "%" { PERCENTAGE (int_of_string n) }
| (num as n) (ident as dim) { DIMENSION (int_of_string n, dim) }
| num as n { NUMBER (int_of_string n) }
| "url(" w (mystring as uri) w ")" { URI uri }
| "url(" w (url as uri) w ")" { URI uri }
| baduri as uri { BAD_URI uri }
| (ident as fn) '(' { FUNCTION fn }
| '(' { LPAREN }
| ')' { RPAREN }
| '{' { LBRACE }
| '}' { RBRACE }
| '[' { LBRACK }
| ']' { RBRACK }
| ';' { SEMICOL }
| ',' { COMMA }
| ':' { COLON }
| "@media" { MEDIA }
| "@import" { IMPORT }
| "@charset" { CHARSET }
| "@page" { PAGE }
| "@font-face" { FONTFACE }
| "@namespace" { NAMESPACE }
| "!important" { IMPORTANT }
| ['A'-'Z''a'-'z''0'-'9''_''-''#''.']+ as id { ID id }
| ['.''#'':']['A'-'Z''a'-'z''_''-']['A'-'Z''a'-'z''0'-'9''_''-''.''#'':']* as id { SELECTOR id }
| '\r' | '\n' | "\r\n" { next_line lexbuf; token lexbuf }
| [' ''\t']+ { token lexbuf }
| "/*" { comment lexbuf }
| '"' { str (Buffer.create 17) lexbuf }
| eof | '\000' { EOF }
| _ as chr { raise (SyntaxError ("unexpected char: " ^ Char.escaped chr)) }
(* Multi-line comments *)
and comment = parse
| '\r' | '\n' | "\r\n" { next_line lexbuf; comment lexbuf }
| "*/" { token lexbuf }
| _ { comment lexbuf }
(* Strings *)
and str buf = parse
| '"' { STRING (Buffer.contents buf) }
| '\\''/' { Buffer.add_char buf '/'; str buf lexbuf }
| '\\''\\' { Buffer.add_char buf '\\'; str buf lexbuf }
| '\\''b' { Buffer.add_char buf '\b'; str buf lexbuf }
| '\\''f' { Buffer.add_char buf '\012'; str buf lexbuf }
| '\\''n' { Buffer.add_char buf '\n'; str buf lexbuf }
| '\\''r' { Buffer.add_char buf '\r'; str buf lexbuf }
| '\\''t' { Buffer.add_char buf '\t'; str buf lexbuf }
| [^'"''\\']+ as s { Buffer.add_string buf s; str buf lexbuf }
| eof { raise (SyntaxError "unterminated string") }
(*
| _ as c { raise (SyntaxError ("illegal string character: " ^ Char.escaped c)) }
*)
......@@ -6,10 +6,12 @@ let prop2str (name, value) = name ^ ":" ^ Stringify.value2str value
%}
(* Tokens *)
%token LPAREN RPAREN LBRACE RBRACE SEMICOL COMMA COLON
%token MEDIA IMPORT CHARSET PAGE FONTFACE NAMESPACE
%token IMPORTANT EOF
%token <string> ID STRING SELECTOR
%token S CDO CDC INCLUDES DASHMATCH STRING BAD_STRING IMPORT_SYM PAGE_SYM
%token MEDIA_SYM CHARSET_SYM IMPORTANT_SYM
%token LPAREN RPAREN LBRACE RBRACE LBRACK RBRACK SEMICOL COLON
%token <int> EMS EXS PERCENTAGE NUMBER
%token <int * string> LENGTH ANGLE TIME FREQ DIMENSION
%token <string> IDENT HASH URI BAD_URI FUNCTION
(* Start symbol *)
%type <Types.decl list> stylesheet
......@@ -18,68 +20,43 @@ let prop2str (name, value) = name ^ ":" ^ Stringify.value2str value
%%
(* Left-recursive list (use List.rev to obtain correctly ordered list) *)
(*
llist(x):
| { [] }
| tl=llist(x) hd=x { hd :: tl }
separated_llist(sep, x):
| { [] }
| tl=llist(x) sep hd=x { hd :: tl }
*)
stylesheet:
| decls=llist(decl) EOF
{ List.rev decls }
selector:
| id=ID { [id] }
| id=SELECTOR { [id] }
| tl=selector hd=ID { hd :: tl }
| tl=selector hd=SELECTOR { hd :: tl }
value:
| str=STRING { Str str }
| lit=ID { Lit lit }
| name=ID LPAREN arg=value RPAREN { Fn (name, arg) }
| IMPORTANT { Imp }
| ( CDO | CDC | S | statement )*
prop:
| name=ID COLON v=value+
{ (name, match v with [hd] -> hd | _ -> Lst v) }
statement:
| ruleset
| at_rule
propline:
| p=prop SEMICOL
{ p }
at_rule:
| ATKEYWORD S* any* ( block | SEMICOL S* )
props:
| LBRACE p=llist(propline) last=prop? RBRACE
{ List.rev p @ (match last with None -> [] | Some p -> [p]) }
block:
| LBRACE S* ( any | block | ATKEYWORD S* | SEMICOL S* )* RBRACE S*
group:
| s=separated_nonempty_list(COMMA, selector) p=props
{ Group (List.rev s, p) }
ruleset:
| selectors=any+ LBRACE S* declaration? ( SEMICOL S* declaration? )* RBRACE S*
%inline media:
| m=ID
{ m }
| LPAREN p=prop RPAREN
{ "(" ^ prop2str p ^ ")" }
declaration:
| name=IDENT S* COLON S* value=value
{ Property (name, value) }
%inline stringopt: f=STRING | f=ID { f }
decl:
| g=group
{ g }
| MEDIA queries=separated_nonempty_list(COMMA, media) LBRACE groups=llist(group) RBRACE
{ Media (queries, List.rev groups) }
| IMPORT f=stringopt q=separated_list(COMMA, ID) SEMICOL
{ Import (f, q) }
| CHARSET c=stringopt SEMICOL
{ Charset c }
| PAGE query=ID? p=props
{ Page (query, p) }
| FONTFACE p=props
{ Fontface p }
| NAMESPACE prefix=ID? uri=STRING SEMICOL
{ Namespace (prefix, uri) }
%%
value:
| ( any | block | ATKEYWORD S* )+
any:
| ( IDENT | NUMBER | PERCENTAGE | DIMENSION | STRING | DELIM | URI | HASH |
UNICODE-RANGE | INCLUDES | DASHMATCH | COLON | FUNCTION S* (any|unused)*
RPAREN | LPAREN S* (any|unused)* RPAREN | LBRACK S* (any|unused)* RBRACK) S*
unused:
| block
| ATKEYWORD S*
| SEMICOL S*
| CDO S*
| CDC S*
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment