Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add the list of opened elements to report #75

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
- uses: ocaml/setup-ocaml@v2
with:
ocaml-compiler: ${{matrix.ocaml}}
- run: sudo apt-get install python-bs4
- run: sudo apt-get install python3-bs4
- run: opam install --deps-only --with-test . --yes
- run: opam install js_of_ocaml --yes

Expand Down
2 changes: 2 additions & 0 deletions src/common.ml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ let compare_locations (line, column) (line', column') =
| order -> order

type name = string * string
type attributes = (name * string) list
type open_elements = (name * location * attributes) list

let xml_ns = "http://www.w3.org/XML/1998/namespace"
let xmlns_ns = "http://www.w3.org/2000/xmlns/"
Expand Down
23 changes: 16 additions & 7 deletions src/html_parser.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1015,19 +1015,14 @@ end



let parse requested_context report (tokens, set_tokenizer_state, set_foreign) =
let parse ?(get_opens=ref None) requested_context report
(tokens, set_tokenizer_state, set_foreign) =
let context = Context.uninitialized () in

let throw = ref (fun _ -> ()) in
let ended = ref (fun _ -> ()) in
let output = ref (fun _ -> ()) in

let report_if = Error.report_if report in
let unmatched_end_tag l name k =
report l (`Unmatched_end_tag name) !throw k in
let misnested_tag l t context_name k =
report l (`Misnested_tag (t.name, context_name, t.Token_tag.attributes)) !throw k in

let open_elements = Stack.create () in
let active_formatting_elements = Active.create () in
let subtree_buffer = Subtree.create open_elements in
Expand All @@ -1042,6 +1037,20 @@ let parse requested_context report (tokens, set_tokenizer_state, set_foreign) =
set_foreign (fun () ->
Stack.current_element_is_foreign context open_elements);

let opens () = List.map
(fun e ->
let (ns, s) = e.element_name in
((Ns.to_string ns, s), e.location, e.attributes))
!open_elements
in
get_opens := Some opens;
let report l error throw k = report (opens ()) l error throw k in
let report_if = Error.report_if report in
let unmatched_end_tag l name k =
report l (`Unmatched_end_tag name) !throw k in
let misnested_tag l t context_name k =
report l (`Misnested_tag (t.name, context_name, t.Token_tag.attributes)) !throw k in

let report_if_stack_has_other_than names k =
let rec iterate = function
| [] -> k ()
Expand Down
3 changes: 2 additions & 1 deletion src/html_parser.mli
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
open Common

val parse :
?get_opens:((unit -> open_elements) option ref) ->
[< `Document | `Fragment of string ] option ->
Error.parse_handler ->
(open_elements -> Error.parse_handler) ->
(location * Html_tokenizer.token) Kstream.t *
(Html_tokenizer.state -> unit) *
((unit -> bool) -> unit) ->
Expand Down
51 changes: 37 additions & 14 deletions src/markup.ml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ module Error = Error


type name = Common.name
type attributes = (name * string) list
type open_elements = (name * location * attributes) list

type xml_declaration = Common.xml_declaration =
{version : string;
Expand Down Expand Up @@ -91,11 +93,15 @@ struct
let parse_xml
report ?encoding namespace entity context source =
let with_encoding (encoding : Encoding.t) k =
let get_opens = ref None in
let parse = Xml_parser.parse ~get_opens context namespace report in
let report' x =
report (match !get_opens with None -> [] | Some f -> f ()) x in
source
|> encoding ~report
|> Input.preprocess Common.is_valid_xml_char report
|> Xml_tokenizer.tokenize report entity
|> Xml_parser.parse context namespace report
|> encoding ~report:report'
|> Input.preprocess Common.is_valid_xml_char report'
|> Xml_tokenizer.tokenize report' entity
|> parse
|> k
in

Expand All @@ -116,12 +122,16 @@ struct
|> Utility.strings_to_bytes

let parse_html report ?encoding context source =
let get_opens = ref None in
let parse = Html_parser.parse ~get_opens context report in
let report' x =
report (match !get_opens with None -> [] | Some f -> f ()) x in
let with_encoding (encoding : Encoding.t) k =
source
|> encoding ~report
|> Input.preprocess Common.is_valid_html_char report
|> Html_tokenizer.tokenize report
|> Html_parser.parse context report
|> encoding ~report:report'
|> Input.preprocess Common.is_valid_html_char report'
|> Html_tokenizer.tokenize report'
|> parse
|> k
in

Expand Down Expand Up @@ -190,6 +200,7 @@ sig

val parse_xml :
?report:(location -> Error.t -> unit io) ->
?detailed_report:(open_elements -> location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
Expand All @@ -203,6 +214,7 @@ sig

val parse_html :
?report:(location -> Error.t -> unit io) ->
?detailed_report:(open_elements -> location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?context:[< `Document | `Fragment of string ] ->
(char, _) stream -> async parser
Expand Down Expand Up @@ -248,6 +260,7 @@ end

module Asynchronous (IO : IO) =
struct
let wrap_report_ops report = fun ops l e -> IO.to_cps (fun () -> report ops l e)
let wrap_report report = fun l e -> IO.to_cps (fun () -> report l e)

module Encoding =
Expand All @@ -259,15 +272,20 @@ struct
end

let parse_xml
?(report = fun _ _ -> IO.return ())
?report ?detailed_report
?encoding
?(namespace = fun _ -> None)
?(entity = fun _ -> None)
?context
source =

let report = match detailed_report, report with
| Some f, None -> f
| None, Some f -> (fun _ -> f)
| Some _, Some _ -> invalid_arg "both report and detailed_report given"
| None, None -> (fun _ _ _ -> IO.return ())
in
Cps.parse_xml
(wrap_report report) ?encoding namespace entity context source
(wrap_report_ops report) ?encoding namespace entity context source

let write_xml
?(report = fun _ _ -> IO.return ())
Expand All @@ -277,12 +295,17 @@ struct
Cps.write_xml (wrap_report report) prefix signals

let parse_html
?(report = fun _ _ -> IO.return ())
?report ?detailed_report
?encoding
?context
source =

Cps.parse_html (wrap_report report) ?encoding context source
let report = match detailed_report, report with
| Some f, None -> f
| None, Some f -> (fun _ -> f)
| Some _, Some _ -> invalid_arg "both report and detailed_report given"
| None, None -> (fun _ _ _ -> IO.return ())
in
Cps.parse_html (wrap_report_ops report) ?encoding context source

let write_html ?escape_attribute ?escape_text signals =
Cps.write_html ?escape_attribute ?escape_text signals
Expand Down
11 changes: 11 additions & 0 deletions src/markup.mli
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,9 @@ end
(** {2 Signals} *)

type name = string * string
type attributes = (name * string) list
type open_elements = (name * location * attributes) list

(** Expanded name: a namespace URI followed by a local name. *)

type xml_declaration =
Expand Down Expand Up @@ -321,6 +324,7 @@ val location : _ parser -> location

val parse_xml :
?report:(location -> Error.t -> unit) ->
?detailed_report:(open_elements -> location -> Error.t -> unit) ->
?encoding:Encoding.t ->
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
Expand All @@ -334,6 +338,10 @@ val parse_xml :
You may raise an exception in [report], and it will propagate to the code
reading the signal stream.

[~detailed_report] is similar to report, but receive the list of
open_elements. You must only give [~report] or [~detailed_report] but not
both.

If [~encoding] is {e not} specified, the parser detects the input encoding
automatically. Otherwise, the given encoding is used.

Expand Down Expand Up @@ -371,6 +379,7 @@ val write_xml :

val parse_html :
?report:(location -> Error.t -> unit) ->
?detailed_report:(open_elements -> location -> Error.t -> unit) ->
?encoding:Encoding.t ->
?context:[< `Document | `Fragment of string ] ->
(char, 's) stream -> 's parser
Expand Down Expand Up @@ -826,6 +835,7 @@ sig

val parse_xml :
?report:(location -> Error.t -> unit io) ->
?detailed_report:(open_elements -> location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
Expand All @@ -841,6 +851,7 @@ sig

val parse_html :
?report:(location -> Error.t -> unit io) ->
?detailed_report:(open_elements -> location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?context:[< `Document | `Fragment of string ] ->
(char, _) stream -> async parser
Expand Down
17 changes: 11 additions & 6 deletions src/xml_parser.ml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,13 @@ open Token_tag

let is_whitespace_only strings = List.for_all is_whitespace_only strings

let parse context namespace report tokens =
let parse ?(get_opens=ref None) context namespace report tokens =
let open_elements = ref [] in
let opens () =
List.map (fun (l,name,_,attrs) -> (name,l,attrs)) !open_elements
in
get_opens := Some opens;
let report l error throw k = report (opens ()) l error throw k in
let namespaces = Namespace.Parsing.init namespace in
let is_fragment = ref false in
let fragment_allowed = ref true in
Expand Down Expand Up @@ -45,7 +50,7 @@ let parse context namespace report tokens =
in

deduplicate [] attributes (fun attributes ->
open_elements := (l, expanded_name, raw_name)::!open_elements;
open_elements := (l, expanded_name, raw_name, attributes)::!open_elements;
emit l (`Start_element (expanded_name, attributes)) state))

and pop l state =
Expand Down Expand Up @@ -190,21 +195,21 @@ let parse context namespace report tokens =

let is_on_stack =
!open_elements
|> List.exists (fun (_, name, _) -> name = expanded_name)
|> List.exists (fun (_, name, _, _) -> name = expanded_name)
in

if not is_on_stack then
report l (`Unmatched_end_tag raw_name) !throw content_state
else
let rec pop_until_match () =
match !open_elements with
| (_, name, _)::_ when name = expanded_name ->
| (_, name, _, _)::_ when name = expanded_name ->
pop l (fun () ->
match !open_elements with
| [] when not !is_fragment -> after_root_state ()
| _ -> content_state ())

| (l', _, name)::_ ->
| (l', _, name, _)::_ ->
report l' (`Unmatched_start_tag name) !throw (fun () ->
pop l pop_until_match)

Expand All @@ -225,7 +230,7 @@ let parse context namespace report tokens =
let rec pop_stack () =
match !open_elements with
| [] -> emit_end ()
| (l', _, raw_name)::_ ->
| (l', _, raw_name, _)::_ ->
report l' (`Unmatched_start_tag raw_name) !throw (fun () ->
pop l pop_stack)
in
Expand Down
3 changes: 2 additions & 1 deletion src/xml_parser.mli
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
open Common

val parse :
?get_opens : ((unit -> open_elements) option ref) ->
[< `Document | `Fragment ] option ->
(string -> string option) ->
Error.parse_handler ->
(open_elements -> Error.parse_handler) ->
(location * Xml_tokenizer.token) Kstream.t ->
(location * signal) Kstream.t
2 changes: 1 addition & 1 deletion test/test_html_parser.ml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ let expect ?prefix ?(context = Some `Document) text signals =
|> Markup__Encoding.utf_8
|> Markup__Input.preprocess is_valid_html_char Error.ignore_errors
|> Markup__Html_tokenizer.tokenize Error.ignore_errors
|> Markup__Html_parser.parse context report
|> Markup__Html_parser.parse context (fun _ -> report)
|> iter iterate;

ended ()
Expand Down
2 changes: 1 addition & 1 deletion test/test_xml_parser.ml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ let expect ?context ?(namespace = no_top_level_namespaces) text signals =
|> Markup__Encoding.utf_8
|> Markup__Input.preprocess is_valid_xml_char Error.ignore_errors
|> Markup__Xml_tokenizer.tokenize Error.ignore_errors no_custom_entities
|> Markup__Xml_parser.parse context namespace report
|> Markup__Xml_parser.parse context namespace (fun _ -> report)
|> iter iterate;

ended ()
Expand Down