Commit 70f032a3 authored by Taddeüs Kroes's avatar Taddeüs Kroes

Lexer now correctly tracks line numbers + some general cleanup

parent cfccb863
...@@ -5,11 +5,21 @@ ...@@ -5,11 +5,21 @@
open Parser open Parser
open Types open Types
let next_line lexbuf = let advance_pos lexbuf =
let s = Lexing.lexeme lexbuf in
let rec search from lines =
try
ignore (Str.search_forward (Str.regexp "\r\n\\|\r\\|\n") s from);
search (Str.match_end ()) (lines + 1)
with Not_found ->
lines, String.length s - from
in
let lines, cols = search 0 0 in
let pos = lexbuf.lex_curr_p in let pos = lexbuf.lex_curr_p in
lexbuf.lex_curr_p <- { lexbuf.lex_curr_p <- {
pos with pos_bol = lexbuf.lex_curr_pos; pos with pos_bol = lexbuf.lex_curr_pos - cols;
pos_lnum = pos.pos_lnum + 1 pos_lnum = pos.pos_lnum + lines
} }
let strip_quotes s = String.sub s 1 (String.length s - 2) let strip_quotes s = String.sub s 1 (String.length s - 2)
...@@ -75,10 +85,9 @@ let uagent = ('-' ("webkit" | "moz" | "ms" | "o") '-')? ...@@ -75,10 +85,9 @@ let uagent = ('-' ("webkit" | "moz" | "ms" | "o") '-')?
rule token = parse rule token = parse
| s { S } | "\r\n" | '\r' | '\n' { new_line lexbuf; S }
| [' ' '\t' '\012']+ { S }
| comment (* ignore comments *) | "/*" { comment lexbuf }
| badcomment { token lexbuf } (* unclosed comment at EOF *)
| "<!--" { CDO } | "<!--" { CDO }
| "-->" { CDC } | "-->" { CDC }
...@@ -86,7 +95,7 @@ rule token = parse ...@@ -86,7 +95,7 @@ rule token = parse
| ['>' '~'] as c { COMBINATOR (Char.escaped c) } | ['>' '~'] as c { COMBINATOR (Char.escaped c) }
| mystring as s { STRING (strip_quotes s) } | mystring as s { STRING (strip_quotes s) }
| badstring { raise (SyntaxError "bad string") } | badstring { raise (Syntax_error "bad string") }
| '#' (name as nm) { HASH nm } | '#' (name as nm) { HASH nm }
...@@ -99,8 +108,8 @@ rule token = parse ...@@ -99,8 +108,8 @@ rule token = parse
| '@' uagent K E Y F R A M E S { KEYFRAMES_SYM } | '@' uagent K E Y F R A M E S { KEYFRAMES_SYM }
| '@' S U P P O R T S { SUPPORTS_SYM } | '@' S U P P O R T S { SUPPORTS_SYM }
| (w | comment)* w A N D w (w | comment)* { WS_AND } | (s | comment)* s A N D s (s | comment)* { advance_pos lexbuf; WS_AND }
| (w | comment)* w O R w (w | comment)* { WS_OR } | (s | comment)* s O R s (s | comment)* { advance_pos lexbuf; WS_OR }
| O N L Y { ONLY } | O N L Y { ONLY }
| N O T { NOT } | N O T { NOT }
...@@ -111,7 +120,7 @@ rule token = parse ...@@ -111,7 +120,7 @@ rule token = parse
| ident as id { IDENT id } | ident as id { IDENT id }
| '!' (w | comment)* I M P O R T A N T { IMPORTANT_SYM } | '!' (s | comment)* I M P O R T A N T { IMPORTANT_SYM }
| (num as n) '%' { PERCENTAGE (float_of_string n) } | (num as n) '%' { PERCENTAGE (float_of_string n) }
| (num as n) (E M | E X | P X | C M | M M | I N | P T | P C | D E G | | (num as n) (E M | E X | P X | C M | M M | I N | P T | P C | D E G |
...@@ -119,9 +128,12 @@ rule token = parse ...@@ -119,9 +128,12 @@ rule token = parse
{ UNIT_VALUE (float_of_string n, u) } { UNIT_VALUE (float_of_string n, u) }
| num as n { NUMBER (float_of_string n) } | num as n { NUMBER (float_of_string n) }
| "url(" w (mystring as uri) w ")" { URI (strip_quotes uri) } | "url(" w (mystring as uri) w ")" { advance_pos lexbuf; URI (strip_quotes uri) }
| "url(" w (url as uri) w ")" { URI uri } | "url(" w (url as uri) w ")" { advance_pos lexbuf; URI uri }
| baduri { raise (SyntaxError "bad uri") } | baduri { raise (Syntax_error "bad uri") }
(*
| "url(" { url_start lexbuf }
*)
| (ident as fn) '(' { FUNCTION fn } | (ident as fn) '(' { FUNCTION fn }
...@@ -143,4 +155,28 @@ rule token = parse ...@@ -143,4 +155,28 @@ rule token = parse
| eof | '\000' { EOF } | eof | '\000' { EOF }
| _ as c { raise (SyntaxError ("unexpected '" ^ Char.escaped c ^ "'")) } | _ as c { raise (Syntax_error ("unexpected '" ^ Char.escaped c ^ "'")) }
(* Comments *)
(* [comment] skips the body of a "/*" ... "*/" comment, discarding its
   contents entirely: once the closing "*/" is seen we re-enter the main
   [token] rule.  Newlines inside the comment are still counted via
   [new_line] (presumably Lexing.new_line — confirm against the header)
   so reported positions stay accurate.  Note that ocamllex's longest-match
   rule makes "\r\n" match as a single token, so a CRLF pair is counted as
   one line, not two.  An unterminated comment at end of input is a hard
   error rather than being silently accepted. *)
and comment = parse
| '\r' | '\n' | "\r\n" { new_line lexbuf; comment lexbuf }
| "*/" { token lexbuf }
| eof | '\000' { raise (Syntax_error "unclosed comment") }
| _ { comment lexbuf }
(*
(* URLs *)
and url_start = parse
| '\r' | '\n' | "\r\n" { new_line lexbuf; url_start lexbuf }
| [' ' '\t' '\012']+ { url_start lexbuf }
| urlc+ as uri { url_end uri lexbuf }
| ')' { URI "" }
| mystring as s { url_end (strip_quotes s) lexbuf }
| badstring { raise (Syntax_error "bad string") }
| (eof | '\000' | _) as c { raise (Syntax_error ("unexpected '" ^ c ^ "'")) }
and url_end uri = parse
| '\r' | '\n' | "\r\n" { new_line lexbuf; url_end uri lexbuf }
| [' ' '\t' '\012']+ { url_end uri lexbuf }
| ')' { URI uri }
| (eof | '\000' | _) as c { raise (Syntax_error ("unexpected '" ^ c ^ "'")) }
*)
...@@ -89,7 +89,7 @@ let main () = ...@@ -89,7 +89,7 @@ let main () =
handle_args args; handle_args args;
exit 0 exit 0
with with
| LocError (loc, msg) -> | Loc_error (loc, msg) ->
Util.prerr_loc_msg (args.verbose >= 1) loc ("Error: " ^ msg); Util.prerr_loc_msg (args.verbose >= 1) loc ("Error: " ^ msg);
| Failure err -> | Failure err ->
prerr_endline ("Error: " ^ err); prerr_endline ("Error: " ^ err);
......
...@@ -27,7 +27,7 @@ let parse_input display_name content = ...@@ -27,7 +27,7 @@ let parse_input display_name content =
let lexbuf = Lexing.from_string content in let lexbuf = Lexing.from_string content in
lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_fname = display_name }; lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_fname = display_name };
try Parser.stylesheet Lexer.token lexbuf with try Parser.stylesheet Lexer.token lexbuf with
| SyntaxError msg -> | Syntax_error msg ->
raise (LocError (shift_back lexbuf, msg)) raise (Loc_error (shift_back lexbuf, msg))
| Parser.Error -> | Parser.Error ->
raise (LocError (shift_back lexbuf, "syntax error")) raise (Loc_error (shift_back lexbuf, "syntax error"))
...@@ -33,7 +33,7 @@ ...@@ -33,7 +33,7 @@
| Term left :: Operator op :: Term right :: tl -> | Term left :: Operator op :: Term right :: tl ->
transform_ops (Term (Nary (op, [left; right])) :: tl) transform_ops (Term (Nary (op, [left; right])) :: tl)
| Term hd :: tl -> hd :: transform_ops tl | Term hd :: tl -> hd :: transform_ops tl
| Operator op :: _ -> raise (SyntaxError ("unexpected operator \"" ^ op ^ "\"")) | Operator op :: _ -> raise (Syntax_error ("unexpected operator \"" ^ op ^ "\""))
in in
let rec flatten_nary = function let rec flatten_nary = function
| [] -> [] | [] -> []
...@@ -277,7 +277,7 @@ term: ...@@ -277,7 +277,7 @@ term:
{ let h = "[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]" in { let h = "[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]" in
if Str.string_match (Str.regexp ("^" ^ h ^ "\\(" ^ h ^ "\\)?$")) hex 0 if Str.string_match (Str.regexp ("^" ^ h ^ "\\(" ^ h ^ "\\)?$")) hex 0
then Hexcolor (String.lowercase hex) then Hexcolor (String.lowercase hex)
else raise (SyntaxError ("invalid color #" ^ hex)) } else raise (Syntax_error ("invalid color #" ^ hex)) }
unary_operator: unary_operator:
| MINUS { "-" } | MINUS { "-" }
| PLUS { "+" } | PLUS { "+" }
......
...@@ -57,6 +57,6 @@ type stylesheet = statement list ...@@ -57,6 +57,6 @@ type stylesheet = statement list
type loc = string * int * int * int * int type loc = string * int * int * int * int
exception SyntaxError of string exception Syntax_error of string
exception LocError of loc * string exception Loc_error of loc * string
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment