Skip to content
Snippets Groups Projects
Commit bbb79f6f authored by Taddeüs Kroes's avatar Taddeüs Kroes
Browse files

Rewrote lexer

parent 6ee79d6b
No related branches found
No related tags found
No related merge requests found
{
open Lexing
open Parser
(* Tokenizer according to definition at
* http://www.w3.org/TR/CSS2/syndata.html#tokenization *)
open Lexing
open Parser
exception SyntaxError of string
exception SyntaxError of string
let next_line lexbuf =
let pos = lexbuf.lex_curr_p in
lexbuf.lex_curr_p <- {
pos with pos_bol = lexbuf.lex_curr_pos;
pos_lnum = pos.pos_lnum + 1
}
let next_line lexbuf =
let pos = lexbuf.lex_curr_p in
lexbuf.lex_curr_p <- {
pos with pos_bol = lexbuf.lex_curr_pos;
pos_lnum = pos.pos_lnum + 1
}
}
let h = ['0'-'9''a'-'f']
let wc = '\r''\n' | [' ''\t''\r''\n''\012']
let nonascii = ['\160'-'\255']
let s = [' ''\t''\r''\n''\012']+
let w = s?
let nl = '\n' | '\r''\n' | '\r' | '\012'
let unicode = '\\' h(h(h(h(h(h)?)?)?)?)? wc?
let escape = unicode | '\\'[^'\r''\n''\012''0'-'9''a'-'f']
let nmstart = ['_''a'-'z'] | nonascii | escape
let nmchar = ['_''a'-'z''0'-'9''-'] | nonascii | escape
let string1 = '"'([^'\n''\r''\012''"'] | '\\'nl | escape)*'"'
let string2 = '\''([^'\n''\r''\012''\''] | '\\'nl | escape)*'\''
let mystring = string1 | string2
let badstring1 = '"'([^'\n''\r''\012''"'] | '\\'nl | escape)*'\\'?
let badstring2 = '\''([^'\n''\r''\012''\''] | '\\'nl | escape)*'\\'?
let badstring = badstring1 | badstring2
let badcomment1 = '/''*'[^'*']*'*'+([^'/''*'][^'*']*'*'+)*
let badcomment2 = '/''*'[^'*']*('*'+[^'/''*'][^'*']*)*
let badcomment = badcomment1 | badcomment2
let baduri1 = "url("w(['!''#''$''%''&''*'-'['']'-'~'] | nonascii | escape)*w
let baduri2 = "url("w mystring w
let baduri3 = "url("w badstring
let baduri = baduri1 | baduri2 | baduri3
let comment = '/''*'[^'*']*'*'+([^'/''*'][^'*']*'*'+)'*''/'
let ident = '-'? nmstart nmchar*
let name = nmchar+
let num = ['0'-'9']+ | ['0'-'9']*'.'['0'-'9']+
let url = (['!''#''$''%''&''*''-''~'] | nonascii | escape)*
rule token = parse
| '(' { LPAREN }
| ')' { RPAREN }
| '{' { LBRACE }
| '}' { RBRACE }
| ';' { SEMICOL }
| ',' { COMMA }
| ':' { COLON }
| "@media" { MEDIA }
| "@import" { IMPORT }
| "@charset" { CHARSET }
| "@page" { PAGE }
| "@font-face" { FONTFACE }
| "@namespace" { NAMESPACE }
| "!important" { IMPORTANT }
| ['A'-'Z''a'-'z''0'-'9''_''-''#''.']+ as id { ID id }
| ['.''#'':']['A'-'Z''a'-'z''_''-']['A'-'Z''a'-'z''0'-'9''_''-''.''#'':']* as id { SELECTOR id }
| '\r' | '\n' | "\r\n" { next_line lexbuf; token lexbuf }
| [' ''\t']+ { token lexbuf }
| "/*" { comment lexbuf }
| '"' { str (Buffer.create 17) lexbuf }
| eof | '\000' { EOF }
| _ as chr { raise (SyntaxError ("unexpected char: " ^ Char.escaped chr)) }
(* Multi-line comments *)
and comment = parse
| '\r' | '\n' | "\r\n" { next_line lexbuf; comment lexbuf }
| "*/" { token lexbuf }
| _ { comment lexbuf }
(* Strings *)
and str buf = parse
| '"' { STRING (Buffer.contents buf) }
| '\\''/' { Buffer.add_char buf '/'; str buf lexbuf }
| '\\''\\' { Buffer.add_char buf '\\'; str buf lexbuf }
| '\\''b' { Buffer.add_char buf '\b'; str buf lexbuf }
| '\\''f' { Buffer.add_char buf '\012'; str buf lexbuf }
| '\\''n' { Buffer.add_char buf '\n'; str buf lexbuf }
| '\\''r' { Buffer.add_char buf '\r'; str buf lexbuf }
| '\\''t' { Buffer.add_char buf '\t'; str buf lexbuf }
| [^'"''\\']+ as s { Buffer.add_string buf s; str buf lexbuf }
| eof { raise (SyntaxError "unterminated string") }
| s { S }
| comment (* ignore comments *)
| badcomment (* unclosed comment at EOF *)
| "<!--" { CDO }
| "-->" { CDC }
| "~=" { INCLUDES }
| "|=" { DASHMATCH }
| mystring { STRING }
| badstring { BAD_STRING }
| ident as id { IDENT id }
| '#' (name as name) { HASH name }
| "@import" { IMPORT_SYM }
| "@page" { PAGE_SYM }
| "@media" { MEDIA_SYM }
| "@charset" { CHARSET_SYM }
| '!' (w | comment)* "important" { IMPORTANT_SYM }
| (num as n) "em" { EMS (int_of_string n) }
| (num as n) "ex" { EXS (int_of_string n) }
| (num as n) "px" { LENGTH (int_of_string n, "px") }
| (num as n) "cm" { LENGTH (int_of_string n, "cm") }
| (num as n) "mm" { LENGTH (int_of_string n, "mm") }
| (num as n) "in" { LENGTH (int_of_string n, "in") }
| (num as n) "pt" { LENGTH (int_of_string n, "pt") }
| (num as n) "pc" { LENGTH (int_of_string n, "pc") }
| (num as n) "deg" { ANGLE (int_of_string n, "deg") }
| (num as n) "rad" { ANGLE (int_of_string n, "rad") }
| (num as n) "grad" { ANGLE (int_of_string n, "grad") }
| (num as n) "ms" { TIME (int_of_string n, "ms") }
| (num as n) "s" { TIME (int_of_string n, "s") }
| (num as n) "hz" { FREQ (int_of_string n, "hz") }
| (num as n) "khz" { FREQ (int_of_string n, "khz") }
| (num as n) "%" { PERCENTAGE (int_of_string n) }
| (num as n) (ident as dim) { DIMENSION (int_of_string n, dim) }
| num as n { NUMBER (int_of_string n) }
| "url(" w (mystring as uri) w ")" { URI uri }
| "url(" w (url as uri) w ")" { URI uri }
| baduri as uri { BAD_URI uri }
| (ident as fn) '(' { FUNCTION fn }
| '(' { LPAREN }
| ')' { RPAREN }
| '{' { LBRACE }
| '}' { RBRACE }
| '[' { LBRACK }
| ']' { RBRACK }
| ';' { SEMICOL }
| ':' { COLON }
(*
| _ as c { raise (SyntaxError ("illegal string character: " ^ Char.escaped c)) }
*)
......@@ -6,10 +6,12 @@ let prop2str (name, value) = name ^ ":" ^ Stringify.value2str value
%}
(* Tokens *)
%token LPAREN RPAREN LBRACE RBRACE SEMICOL COMMA COLON
%token MEDIA IMPORT CHARSET PAGE FONTFACE NAMESPACE
%token IMPORTANT EOF
%token <string> ID STRING SELECTOR
%token S CDO CDC INCLUDES DASHMATCH STRING BAD_STRING IMPORT_SYM PAGE_SYM
%token MEDIA_SYM CHARSET_SYM IMPORTANT_SYM
%token LPAREN RPAREN LBRACE RBRACE LBRACK RBRACK SEMICOL COLON
%token <int> EMS EXS PERCENTAGE NUMBER
%token <int * string> LENGTH ANGLE TIME FREQ DIMENSION
%token <string> IDENT HASH URI BAD_URI FUNCTION
(* Start symbol *)
%type <Types.decl list> stylesheet
......@@ -18,68 +20,43 @@ let prop2str (name, value) = name ^ ":" ^ Stringify.value2str value
%%
(* Left-recursive list (use List.rev to obtain correctly ordered list) *)
(*
llist(x):
| { [] }
| tl=llist(x) hd=x { hd :: tl }
separated_llist(sep, x):
| { [] }
| tl=llist(x) sep hd=x { hd :: tl }
*)
stylesheet:
| decls=llist(decl) EOF
{ List.rev decls }
selector:
| id=ID { [id] }
| id=SELECTOR { [id] }
| tl=selector hd=ID { hd :: tl }
| tl=selector hd=SELECTOR { hd :: tl }
value:
| str=STRING { Str str }
| lit=ID { Lit lit }
| name=ID LPAREN arg=value RPAREN { Fn (name, arg) }
| IMPORTANT { Imp }
| ( CDO | CDC | S | statement )*
prop:
| name=ID COLON v=value+
{ (name, match v with [hd] -> hd | _ -> Lst v) }
statement:
| ruleset
| at_rule
propline:
| p=prop SEMICOL
{ p }
at_rule:
| ATKEYWORD S* any* ( block | SEMICOL S* )
props:
| LBRACE p=llist(propline) last=prop? RBRACE
{ List.rev p @ (match last with None -> [] | Some p -> [p]) }
block:
| LBRACE S* ( any | block | ATKEYWORD S* | SEMICOL S* )* RBRACE S*
group:
| s=separated_nonempty_list(COMMA, selector) p=props
{ Group (List.rev s, p) }
ruleset:
| selectors=any+ LBRACE S* declaration? ( SEMICOL S* declaration? )* RBRACE S*
%inline media:
| m=ID
{ m }
| LPAREN p=prop RPAREN
{ "(" ^ prop2str p ^ ")" }
declaration:
| name=IDENT S* COLON S* value=value
{ Property (name, value) }
%inline stringopt: f=STRING | f=ID { f }
decl:
| g=group
{ g }
| MEDIA queries=separated_nonempty_list(COMMA, media) LBRACE groups=llist(group) RBRACE
{ Media (queries, List.rev groups) }
| IMPORT f=stringopt q=separated_list(COMMA, ID) SEMICOL
{ Import (f, q) }
| CHARSET c=stringopt SEMICOL
{ Charset c }
| PAGE query=ID? p=props
{ Page (query, p) }
| FONTFACE p=props
{ Fontface p }
| NAMESPACE prefix=ID? uri=STRING SEMICOL
{ Namespace (prefix, uri) }
%%
value:
| ( any | block | ATKEYWORD S* )+
any:
| ( IDENT | NUMBER | PERCENTAGE | DIMENSION | STRING | DELIM | URI | HASH |
UNICODE-RANGE | INCLUDES | DASHMATCH | COLON | FUNCTION S* (any|unused)*
RPAREN | LPAREN S* (any|unused)* RPAREN | LBRACK S* (any|unused)* RBRACK) S*
unused:
| block
| ATKEYWORD S*
| SEMICOL S*
| CDO S*
| CDC S*
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment