(* lexer.mll *)
  1. {
  2. (* Tokenizer according to definition at
  3. * http://www.w3.org/TR/CSS2/syndata.html#tokenization *)
  4. open Lexing
  5. open Parser
  6. open Types
  7. let advance_pos lexbuf =
  8. let s = Lexing.lexeme lexbuf in
  9. let rec search from lines =
  10. try
  11. ignore (Str.search_forward (Str.regexp "\r\n\\|\r\\|\n") s from);
  12. search (Str.match_end ()) (lines + 1)
  13. with Not_found ->
  14. lines, String.length s - from
  15. in
  16. let lines, cols = search 0 0 in
  17. let pos = lexbuf.lex_curr_p in
  18. lexbuf.lex_curr_p <- {
  19. pos with pos_bol = lexbuf.lex_curr_pos - cols;
  20. pos_lnum = pos.pos_lnum + lines
  21. }
  22. let strip_quotes s = String.sub s 1 (String.length s - 2)
  23. }
(* Pattern abbreviations, following the CSS 2.1 core grammar
 * (http://www.w3.org/TR/CSS2/syndata.html#tokenization). *)
(* Hexadecimal digit. *)
let h = ['0'-'9' 'a'-'f' 'A'-'F']
(* Whitespace that may terminate a unicode escape (CRLF counts as one). *)
let wc = '\r' '\n' | [' ' '\t' '\r' '\n' '\012']
(* Non-ASCII bytes permitted in identifiers (upper half of Latin-1). *)
let nonascii = ['\160'-'\255']
(* One or more whitespace characters. *)
let s = [' ' '\t' '\r' '\n' '\012']+
(* Optional whitespace. *)
let w = s?
(* A newline in any of its spellings. *)
let nl = '\n' | '\r' '\n' | '\r' | '\012'
(* Unicode escape: backslash, 1-6 hex digits, optional trailing whitespace. *)
let unicode = '\\' h(h(h(h(h(h)?)?)?)?)? wc?
(* Unicode escape, or backslash before any non-hex, non-newline character. *)
let escape = unicode | '\\'[^'\r' '\n' '\012' '0'-'9' 'a'-'f' 'A'-'F']
(* First character of an identifier. *)
let nmstart = ['_' 'a'-'z' 'A'-'Z'] | nonascii | escape
(* Subsequent identifier characters (digits and '-' now allowed). *)
let nmchar = ['_' 'a'-'z' 'A'-'Z' '0'-'9' '-'] | nonascii | escape
(* Double- and single-quoted strings; backslash-escaped newlines allowed. *)
let string1 = '"' ([^'\n' '\r' '\012' '"'] | '\\'nl | escape)* '"'
let string2 = '\'' ([^'\n' '\r' '\012' '\''] | '\\' nl | escape)* '\''
let mystring = string1 | string2
(* Strings left unterminated (e.g. cut off by end of line/input). *)
let badstring1 = '"' ([^'\n' '\r' '\012' '"'] | '\\'nl | escape)* '\\'?
let badstring2 = '\'' ([^'\n' '\r' '\012' '\''] | '\\'nl | escape)* '\\'?
let badstring = badstring1 | badstring2
(* Comments missing their closing "*/". *)
let badcomment1 = "/*" [^'*']* '*'+ ([^'/' '*'] [^'*']* '*'+)*
let badcomment2 = "/*" [^'*']* ('*'+ [^'/' '*'] [^'*']*)*
let badcomment = badcomment1 | badcomment2
(* Malformed url(...) constructs. *)
let baduri1 = "url(" w (['!' '#' '$' '%' '&' '*'-'[' ']'-'~'] | nonascii | escape)* w
let baduri2 = "url(" w mystring w
let baduri3 = "url(" w badstring
let baduri = baduri1 | baduri2 | baduri3
(* A complete, well-formed comment. *)
let comment = "/*" [^'*']* '*'+ ([^'/' '*'] [^'*']* '*'+)* '/'
(* Identifier with optional leading '-' (vendor prefixes). *)
let ident = '-'? nmstart nmchar*
let name = nmchar+
(* Integer or decimal literal. *)
let num = ['0'-'9']+ | ['0'-'9']* '.' ['0'-'9']+
(* Characters allowed in an unquoted url(...) argument. *)
let url = (['!' '#' '$' '%' '&' '*'-'~'] | nonascii | escape)*
(* Case-insensitive letters, used below to spell keywords like @import. *)
let A = ['a' 'A']
let B = ['b' 'B']
let C = ['c' 'C']
let D = ['d' 'D']
let E = ['e' 'E']
let F = ['f' 'F']
let G = ['g' 'G']
let H = ['h' 'H']
let I = ['i' 'I']
let J = ['j' 'J']
let K = ['k' 'K']
let L = ['l' 'L']
let M = ['m' 'M']
let N = ['n' 'N']
let O = ['o' 'O']
let P = ['p' 'P']
let Q = ['q' 'Q']
let R = ['r' 'R']
let S = ['s' 'S']
let T = ['t' 'T']
let U = ['u' 'U']
let V = ['v' 'V']
let W = ['w' 'W']
let X = ['x' 'X']
let Y = ['y' 'Y']
let Z = ['z' 'Z']
  78. rule token = parse
  79. | "\r\n" | '\r' | '\n' { new_line lexbuf; S }
  80. | [' ' '\t' '\012']+ { S }
  81. | "/*" { comment lexbuf }
  82. | "<!--" { CDO }
  83. | "-->" { CDC }
  84. | ['~' '^' '$' '*' '|']? '=' as op { RELATION op }
  85. | ['>' '~'] as c { COMBINATOR (Char.escaped c) }
  86. | mystring as s { STRING (strip_quotes s) }
  87. | badstring { raise (Syntax_error "bad string") }
  88. | '#' (name as nm) { HASH nm }
  89. | '@' I M P O R T { IMPORT_SYM }
  90. | '@' P A G E { PAGE_SYM }
  91. | '@' M E D I A { MEDIA_SYM }
  92. | "@charset " { CHARSET_SYM }
  93. | '@' F O N T '-' F A C E { FONT_FACE_SYM }
  94. | '@' N A M E S P A C E { NAMESPACE_SYM }
  95. | '@' S U P P O R T S { SUPPORTS_SYM }
  96. | '@' (('-' ident '-')? as prefix) K E Y F R A M E S
  97. { KEYFRAMES_SYM (String.lowercase prefix) }
  98. | '@' (('-' ident '-')? as prefix) V I E W P O R T
  99. { VIEWPORT_SYM (String.lowercase prefix) }
  100. | (s | comment)* s comment* A N D comment* s (s | comment)*
  101. { advance_pos lexbuf; WS_AND }
  102. | (s | comment)* s comment* O R comment* s (s | comment)*
  103. { advance_pos lexbuf; WS_OR }
  104. | O N L Y { ONLY }
  105. | N O T { NOT }
  106. | A N D { AND }
  107. (*| O R { OR } removed in favor of WS_OR *)
  108. | F R O M { FROM }
  109. | T O { TO }
  110. | ident as id { IDENT id }
  111. | '!' (s | comment)* I M P O R T A N T { IMPORTANT_SYM }
  112. | (num as n) '%' { PERCENTAGE (float_of_string n) }
  113. | (num as n) (E M | E X | P X | C M | M M | I N | P T | P C | D E G |
  114. G? R A D | M? S | K? H Z | D P (I | C M) | ident as u)
  115. { UNIT_VALUE (float_of_string n, u) }
  116. | num as n { NUMBER (float_of_string n) }
  117. | "url(" w (mystring as uri) w ")" { advance_pos lexbuf; URI (strip_quotes uri) }
  118. | "url(" w (url as uri) w ")" { advance_pos lexbuf; URI uri }
  119. | baduri { raise (Syntax_error "bad uri") }
  120. (*
  121. | "url(" { url_start lexbuf }
  122. *)
  123. | (ident as fn) '(' { FUNCTION fn }
  124. | '(' { LPAREN }
  125. | ')' { RPAREN }
  126. | '{' { LBRACE }
  127. | '}' { RBRACE }
  128. | '[' { LBRACK }
  129. | ']' { RBRACK }
  130. | ';' { SEMICOL }
  131. | ':' { COLON }
  132. | "::" { DOUBLE_COLON }
  133. | ',' { COMMA }
  134. | '.' { DOT }
  135. | '+' { PLUS }
  136. | '-' { MINUS }
  137. | '/' { SLASH }
  138. | '*' { STAR }
  139. | eof | '\000' { EOF }
  140. | _ as c { raise (Syntax_error ("unexpected '" ^ Char.escaped c ^ "'")) }
(* Comments *)
(* Skip the body of a "/* ... */" comment, tracking newlines, and return
 * the token that follows it. Raises [Syntax_error] if input ends before
 * the closing delimiter. *)
and comment = parse
| '\r' | '\n' | "\r\n" { new_line lexbuf; comment lexbuf }
| "*/" { token lexbuf }
| eof | '\000' { raise (Syntax_error "unclosed comment") }
| _ { comment lexbuf }
(*
(* URLs *)
and url_start = parse
| '\r' | '\n' | "\r\n" { new_line lexbuf; url_start lexbuf }
| [' ' '\t' '\012']+ { url_start lexbuf }
| urlc+ as uri { url_end uri lexbuf }
| ')' { URI "" }
| mystring as s { url_end (strip_quotes s) lexbuf }
| badstring { raise (Syntax_error "bad string") }
| (eof | '\000' | _) as c { raise (Syntax_error ("unexpected '" ^ c ^ "'")) }
and url_end uri = parse
| '\r' | '\n' | "\r\n" { new_line lexbuf; url_end uri lexbuf }
| [' ' '\t' '\012']+ { url_end uri lexbuf }
| ')' { URI uri }
| (eof | '\000' | _) as c { raise (Syntax_error ("unexpected '" ^ c ^ "'")) }
*)