From 70f032a31d397140fd46ddce8de362d4f55eeb61 Mon Sep 17 00:00:00 2001 From: Taddeus Kroes <taddeuskroes@gmail.com> Date: Mon, 21 Jul 2014 15:06:55 +0200 Subject: [PATCH] Lexer now correctly tracks line numbers + some general cleanup --- lexer.mll | 66 +++++++++++++++++++++++++++++++++++++++++------------- main.ml | 2 +- parse.ml | 6 ++--- parser.mly | 4 ++-- types.ml | 4 ++-- 5 files changed, 59 insertions(+), 23 deletions(-) diff --git a/lexer.mll b/lexer.mll index ddcb346..3b2a870 100644 --- a/lexer.mll +++ b/lexer.mll @@ -5,11 +5,21 @@ open Parser open Types - let next_line lexbuf = + let advance_pos lexbuf = + let s = Lexing.lexeme lexbuf in + let rec search from lines = + try + ignore (Str.search_forward (Str.regexp "\r\n\\|\r\\|\n") s from); + search (Str.match_end ()) (lines + 1) + with Not_found -> + lines, String.length s - from + in + let lines, cols = search 0 0 in + let pos = lexbuf.lex_curr_p in lexbuf.lex_curr_p <- { - pos with pos_bol = lexbuf.lex_curr_pos; - pos_lnum = pos.pos_lnum + 1 + pos with pos_bol = lexbuf.lex_curr_pos - cols; + pos_lnum = pos.pos_lnum + lines } let strip_quotes s = String.sub s 1 (String.length s - 2) @@ -75,10 +85,9 @@ let uagent = ('-' ("webkit" | "moz" | "ms" | "o") '-')? rule token = parse - | s { S } - - | comment (* ignore comments *) - | badcomment { token lexbuf } (* unclosed comment at EOF *) + | "\r\n" | '\r' | '\n' { new_line lexbuf; S } + | [' ' '\t' '\012']+ { S } + | "/*" { comment lexbuf } | "<!--" { CDO } | "-->" { CDC } @@ -86,7 +95,7 @@ rule token = parse | ['>' '~'] as c { COMBINATOR (Char.escaped c) } | mystring as s { STRING (strip_quotes s) } - | badstring { raise (SyntaxError "bad string") } + | badstring { raise (Syntax_error "bad string") } | '#' (name as nm) { HASH nm } @@ -99,8 +108,8 @@ rule token = parse | '@' uagent K E Y F R A M E S { KEYFRAMES_SYM } | '@' S U P P O R T S { SUPPORTS_SYM } - | (w | comment)* w A N D w (w | comment)* { WS_AND } - | (w | comment)* w O R w (w | comment)* { WS_OR } + | (s | comment)* s A N D s (s | comment)* { advance_pos lexbuf; WS_AND } + | (s | comment)* s O R s (s | comment)* { advance_pos lexbuf; WS_OR } | O N L Y { ONLY } | N O T { NOT } @@ -111,7 +120,7 @@ rule token = parse | ident as id { IDENT id } - | '!' (w | comment)* I M P O R T A N T { IMPORTANT_SYM } + | '!' (s | comment)* I M P O R T A N T { IMPORTANT_SYM } | (num as n) '%' { PERCENTAGE (float_of_string n) } | (num as n) (E M | E X | P X | C M | M M | I N | P T | P C | D E G | @@ -119,9 +128,12 @@ rule token = parse { UNIT_VALUE (float_of_string n, u) } | num as n { NUMBER (float_of_string n) } - | "url(" w (mystring as uri) w ")" { URI (strip_quotes uri) } - | "url(" w (url as uri) w ")" { URI uri } - | baduri { raise (SyntaxError "bad uri") } + | "url(" w (mystring as uri) w ")" { advance_pos lexbuf; URI (strip_quotes uri) } + | "url(" w (url as uri) w ")" { advance_pos lexbuf; URI uri } + | baduri { raise (Syntax_error "bad uri") } + (* + | "url(" { url_start lexbuf } + *) | (ident as fn) '(' { FUNCTION fn } @@ -143,4 +155,28 @@ rule token = parse | eof | '\000' { EOF } - | _ as c { raise (SyntaxError ("unexpected '" ^ Char.escaped c ^ "'")) } + | _ as c { raise (Syntax_error ("unexpected '" ^ Char.escaped c ^ "'")) } + +(* Comments *) +and comment = parse + | '\r' | '\n' | "\r\n" { new_line lexbuf; comment lexbuf } + | "*/" { token lexbuf } + | eof | '\000' { raise (Syntax_error "unclosed comment") } + | _ { comment lexbuf } + +(* +(* URLs *) +and url_start = parse + | '\r' | '\n' | "\r\n" { new_line lexbuf; url_start lexbuf } + | [' ' '\t' '\012']+ { url_start lexbuf } + | urlc+ as uri { url_end uri lexbuf } + | ')' { URI "" } + | mystring as s { url_end (strip_quotes s) lexbuf } + | badstring { raise (Syntax_error "bad string") } + | (eof | '\000' | _) as c { raise (Syntax_error ("unexpected '" ^ c ^ "'")) } +and url_end uri = parse + | '\r' | '\n' | "\r\n" { new_line lexbuf; url_end uri lexbuf } + | [' ' '\t' '\012']+ { url_end uri lexbuf } + | ')' { URI uri } + | (eof | '\000' | _) as c { raise (Syntax_error ("unexpected '" ^ c ^ "'")) } +*) diff --git a/main.ml b/main.ml index 79350fd..eb32bcc 100644 --- a/main.ml +++ b/main.ml @@ -89,7 +89,7 @@ let main () = handle_args args; exit 0 with - | LocError (loc, msg) -> + | Loc_error (loc, msg) -> Util.prerr_loc_msg (args.verbose >= 1) loc ("Error: " ^ msg); | Failure err -> prerr_endline ("Error: " ^ err); diff --git a/parse.ml b/parse.ml index 0e997bc..60049f1 100644 --- a/parse.ml +++ b/parse.ml @@ -27,7 +27,7 @@ let parse_input display_name content = let lexbuf = Lexing.from_string content in lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_fname = display_name }; try Parser.stylesheet Lexer.token lexbuf with - | SyntaxError msg -> - raise (LocError (shift_back lexbuf, msg)) + | Syntax_error msg -> + raise (Loc_error (shift_back lexbuf, msg)) | Parser.Error -> - raise (LocError (shift_back lexbuf, "syntax error")) + raise (Loc_error (shift_back lexbuf, "syntax error")) diff --git a/parser.mly b/parser.mly index e6b3d1d..f984538 100644 --- a/parser.mly +++ b/parser.mly @@ -33,7 +33,7 @@ | Term left :: Operator op :: Term right :: tl -> transform_ops (Term (Nary (op, [left; right])) :: tl) | Term hd :: tl -> hd :: transform_ops tl - | Operator op :: _ -> raise (SyntaxError ("unexpected operator \"" ^ op ^ "\"")) + | Operator op :: _ -> raise (Syntax_error ("unexpected operator \"" ^ op ^ "\"")) in let rec flatten_nary = function | [] -> [] @@ -277,7 +277,7 @@ term: { let h = "[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]" in if Str.string_match (Str.regexp ("^" ^ h ^ "\\(" ^ h ^ "\\)?$")) hex 0 then Hexcolor (String.lowercase hex) - else raise (SyntaxError ("invalid color #" ^ hex)) } + else raise (Syntax_error ("invalid color #" ^ hex)) } unary_operator: | MINUS { "-" } | PLUS { "+" } diff --git a/types.ml b/types.ml index dada6de..d5c7a5c 100644 --- a/types.ml +++ b/types.ml @@ -57,6 +57,6 @@ type stylesheet = statement list type loc = string * int * int * int * int -exception SyntaxError of string +exception Syntax_error of string -exception LocError of loc * string +exception Loc_error of loc * string -- GitLab