lexer.mll 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. {
  2. (* Tokenizer according to definition at
  3. * http://www.w3.org/TR/CSS2/syndata.html#tokenization *)
  4. open Lexing
  5. open Parser
  6. open Types
  (* Record in [lexbuf] that a line break has just been consumed.

     Delegates to [Lexing.new_line], which increments [pos_lnum] of the
     current position and sets [pos_bol] to the current absolute offset
     [pos_cnum].  The buffer-relative [lex_curr_pos] must not be used for
     [pos_bol]: it differs from [pos_cnum] once the lexer buffer has been
     refilled, which would corrupt column numbers computed as
     [pos_cnum - pos_bol]. *)
  let next_line lexbuf =
    Lexing.new_line lexbuf
  (* Drop the first and the last character of [s]; used on quoted string
     lexemes to remove the surrounding quote characters.  Raises
     [Invalid_argument] (from [String.sub]) when [s] has fewer than two
     characters. *)
  let strip_quotes s =
    let inner_length = String.length s - 2 in
    String.sub s 1 inner_length
  14. }
  (* Named regular expressions mirroring the macros of the CSS 2.1
     tokenizer grammar.  Character codes are decimal: '\012' is form feed
     and '\160'-'\255' covers the non-ASCII byte range. *)

  (* Hexadecimal digit, either case *)
  let h = ['0'-'9' 'a'-'f' 'A'-'F']
  (* A single whitespace unit that may terminate a unicode escape *)
  let wc = '\r' '\n' | [' ' '\t' '\r' '\n' '\012']
  let nonascii = ['\160'-'\255']
  (* Mandatory and optional whitespace runs *)
  let s = [' ' '\t' '\r' '\n' '\012']+
  let w = s?
  (* Line break in any of its spellings *)
  let nl = '\n' | '\r' '\n' | '\r' | '\012'
  (* Backslash followed by one to six hex digits and an optional
     terminating whitespace unit *)
  let unicode = '\\' h(h(h(h(h(h)?)?)?)?)? wc?
  (* Unicode escape, or backslash followed by any character that is
     neither a line break nor a hex digit *)
  let escape = unicode | '\\'[^'\r' '\n' '\012' '0'-'9' 'a'-'f' 'A'-'F']
  let nmstart = ['_' 'a'-'z' 'A'-'Z'] | nonascii | escape
  let nmchar = ['_' 'a'-'z' 'A'-'Z' '0'-'9' '-'] | nonascii | escape

  (* Quoted strings.  The plain-character class excludes the backslash so
     that a backslash can only occur as part of an escape or of a line
     continuation.  Without that exclusion an escaped backslash directly
     before the closing quote could be re-read as a plain backslash
     followed by an escaped quote, and the longest-match rule would then
     extend the string up to the next quote character on the line. *)
  let string1 = '"' ([^'\n' '\r' '\012' '\\' '"'] | '\\' nl | escape)* '"'
  let string2 = '\'' ([^'\n' '\r' '\012' '\\' '\''] | '\\' nl | escape)* '\''
  let mystring = string1 | string2

  (* Strings that are not closed before the end of the line or input *)
  let badstring1 = '"' ([^'\n' '\r' '\012' '\\' '"'] | '\\' nl | escape)* '\\'?
  let badstring2 = '\'' ([^'\n' '\r' '\012' '\\' '\''] | '\\' nl | escape)* '\\'?
  let badstring = badstring1 | badstring2

  (* Comments that are opened but never closed *)
  let badcomment1 = "/*" [^'*']* '*'+ ([^'/' '*'] [^'*']* '*'+)*
  let badcomment2 = "/*" [^'*']* ('*'+ [^'/' '*'] [^'*']*)*
  let badcomment = badcomment1 | badcomment2

  (* url( forms that lack the closing parenthesis *)
  let baduri1 = "url(" w (['!' '#' '$' '%' '&' '*'-'[' ']'-'~'] | nonascii | escape)* w
  let baduri2 = "url(" w mystring w
  let baduri3 = "url(" w badstring
  let baduri = baduri1 | baduri2 | baduri3

  (* Properly closed comment *)
  let comment = "/*" [^'*']* '*'+ ([^'/' '*'] [^'*']* '*'+)* '/'
  let ident = '-'? nmstart nmchar*
  let name = nmchar+
  (* Unsigned integer or decimal number; the integer part may be empty *)
  let num = ['0'-'9']+ | ['0'-'9']* '.' ['0'-'9']+
  (* Body of an unquoted url( ... ) *)
  let url = (['!' '#' '$' '%' '&' '*'-'~'] | nonascii | escape)*
  (* One macro per Latin letter, matching that letter in either case.
     Spelling a keyword as a sequence of these macros makes the keyword
     case-insensitive. *)
  let A = 'a' | 'A'
  let B = 'b' | 'B'
  let C = 'c' | 'C'
  let D = 'd' | 'D'
  let E = 'e' | 'E'
  let F = 'f' | 'F'
  let G = 'g' | 'G'
  let H = 'h' | 'H'
  let I = 'i' | 'I'
  let J = 'j' | 'J'
  let K = 'k' | 'K'
  let L = 'l' | 'L'
  let M = 'm' | 'M'
  let N = 'n' | 'N'
  let O = 'o' | 'O'
  let P = 'p' | 'P'
  let Q = 'q' | 'Q'
  let R = 'r' | 'R'
  let S = 's' | 'S'
  let T = 't' | 'T'
  let U = 'u' | 'U'
  let V = 'v' | 'V'
  let W = 'w' | 'W'
  let X = 'x' | 'X'
  let Y = 'y' | 'Y'
  let Z = 'z' | 'Z'

  (* Optional lower-case vendor prefix, including both hyphens *)
  let uagent = ("-webkit-" | "-moz-" | "-ms-" | "-o-")?
  (* Main entry point: read one token from the lexer buffer.

     The longest match wins; between matches of equal length the clause
     listed first wins, so the order of the clauses below is significant.
     Comments are skipped by re-entering the rule.  Unterminated strings
     and malformed url( forms raise SyntaxError.

     NOTE(review): no clause calls next_line, so the line number stored
     in the lexbuf position is not advanced by this rule; confirm whether
     position tracking is handled elsewhere. *)
  rule token = parse
    (* Whitespace run *)
    | s { S }

    (* Closed comments are ignored, and so is an unclosed comment at EOF *)
    | comment
    | badcomment { token lexbuf }

    | "<!--" { CDO }
    | "-->" { CDC }

    (* Attribute selector relations and selector combinators *)
    | ['~' '^' '$' '*' '|']? '=' as op { RELATION op }
    | ['>' '~'] as c { COMBINATOR (Char.escaped c) }

    (* Strings are returned without their quotes; escapes are kept as is *)
    | mystring as s { STRING (strip_quotes s) }
    | badstring { raise (SyntaxError "bad string") }

    | '#' (name as nm) { HASH nm }

    (* At-keywords; all but @charset are case-insensitive *)
    | '@' I M P O R T { IMPORT_SYM }
    | '@' P A G E { PAGE_SYM }
    | '@' M E D I A { MEDIA_SYM }
    | "@charset " { CHARSET_SYM }
    | '@' F O N T '-' F A C E { FONT_FACE_SYM }
    | '@' N A M E S P A C E { NAMESPACE_SYM }
    | '@' uagent K E Y F R A M E S { KEYFRAMES_SYM }
    | '@' S U P P O R T S { SUPPORTS_SYM }

    (* The keywords and / or together with any surrounding whitespace and
       comments.
       NOTE(review): every whitespace part here is optional, so these
       clauses also match a bare keyword.  Being listed first, the and
       clause therefore always beats the AND clause below on an input of
       equal length; confirm that the parser relies on this. *)
    | (w | comment)* w A N D w (w | comment)* { SUPPORTS_AND }
    | (w | comment)* w O R w (w | comment)* { SUPPORTS_OR }

    (* Keywords; listed before ident so that they win over it *)
    | O N L Y { ONLY }
    | N O T { NOT }
    | A N D { AND }
    (*| O R { OR } removed in favor of SUPPORTS_OR *)
    | F R O M { FROM }
    | T O { TO }

    | ident as id { IDENT id }

    | '!' (w | comment)* I M P O R T A N T { IMPORTANT_SYM }

    (* Numbers, optionally followed by a percent sign or by a unit; the
       unit is either one of the listed keywords or any identifier *)
    | (num as n) '%' { PERCENTAGE (float_of_string n) }
    | (num as n) (E M | E X | P X | C M | M M | I N | P T | P C | D E G |
                  G? R A D | M? S | K? H Z | D P (I | C M) | ident as u)
      { UNIT_VALUE (float_of_string n, u) }
    | num as n { NUMBER (float_of_string n) }

    (* url( ... ) with a quoted or an unquoted body *)
    | "url(" w (mystring as uri) w ")" { URI (strip_quotes uri) }
    | "url(" w (url as uri) w ")" { URI uri }
    | baduri { raise (SyntaxError "bad uri") }

    (* Function name including its opening parenthesis *)
    | (ident as fn) '(' { FUNCTION fn }

    (* Punctuation *)
    | '(' { LPAREN }
    | ')' { RPAREN }
    | '{' { LBRACE }
    | '}' { RBRACE }
    | '[' { LBRACK }
    | ']' { RBRACK }
    | ';' { SEMICOL }
    | ':' { COLON }
    | ',' { COMMA }
    | '.' { DOT }
    | '+' { PLUS }
    | '-' { MINUS }
    | '/' { SLASH }
    | '*' { STAR }

    (* End of input, also signalled by a NUL character *)
    | eof | '\000' { EOF }

    (* Any other character is skipped silently.
       NOTE(review): a second catch-all clause,
         | _ as c { raise (SyntaxError ...) }
       used to follow this one.  It could never be selected because this
       clause matches the same input and comes first, so it has been
       dropped.  Put it back in place of this clause if unexpected
       characters should be rejected instead of skipped. *)
    | _ { token lexbuf }