Skip to content

Commit

Permalink
Index_format: fix serialization of related_uids with granular_marshal
Browse files Browse the repository at this point in the history
  • Loading branch information
art-w authored and voodoos committed Jan 21, 2025
1 parent dd35ece commit 49a53ad
Show file tree
Hide file tree
Showing 7 changed files with 157 additions and 37 deletions.
2 changes: 1 addition & 1 deletion src/analysis/occurrences.ml
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ let lookup_related_uids_in_indexes ~(config : Mconfig.t) uid =
try
let index = Index_cache.read index_file in
Uid_map.union
(fun _ a b -> Some (Union_find.union ~f:Uid_set.union a b))
(fun _ a b -> Some (Union_find.union a b))
index.related_uids acc
with Index_format.Not_an_index _ | Sys_error _ ->
log ~title "Could not load index %s" index_file;
Expand Down
85 changes: 58 additions & 27 deletions src/index-format/granular_marshal.ml
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
type store = string
module Cache = Hashtbl.Make (Int)

type 'a link = 'a repr ref
type store = { filename : string; cache : any_link Cache.t }

and any_link = Link : 'a link * 'a schema -> any_link

and 'a link = 'a repr ref

and 'a repr =
| Small of 'a
| Serialized of { loc : int }
| Serialized_reused of { loc : int }
| On_disk of { store : store; loc : int; schema : 'a schema }
| In_memory of 'a
| In_memory_reused of 'a
Expand All @@ -19,19 +24,32 @@ let schema_no_sublinks : _ schema = fun _ _ -> ()

let link v = ref (In_memory v)

let rec normalize lnk =
match !lnk with
| Duplicate lnk -> normalize lnk
| _ -> lnk

let read_loc store fd loc schema =
seek_in fd loc;
let v = Marshal.from_channel fd in
let rec iter =
{ yield =
(fun lnk schema ->
match !lnk with
| Serialized { loc } -> lnk := On_disk { store; loc; schema }
| Small v ->
schema iter v;
lnk := In_memory v
| In_memory _ | In_memory_reused _ | On_disk _ -> ()
| Duplicate _ -> invalid_arg "Granular_marshal.read_loc: Duplicate"
| Serialized { loc } -> lnk := On_disk { store; loc; schema }
| Serialized_reused { loc } -> (
match Cache.find store.cache loc with
| Link (lnk', schema') ->
let lnk' = normalize lnk' in
assert (schema == Obj.magic schema');
lnk := Duplicate (Obj.magic lnk')
| exception Not_found ->
lnk := On_disk { store; loc; schema };
Cache.add store.cache loc (Link (lnk, schema)))
| In_memory _ | In_memory_reused _ | On_disk _ | Duplicate _ -> ()
| Placeholder -> invalid_arg "Granular_marshal.read_loc: Placeholder")
}
in
Expand All @@ -47,7 +65,7 @@ let () =
| Some (_, fd) -> close_in fd)

let force_open_store store =
let fd = open_in store in
let fd = open_in store.filename in
last_open_store := Some (store, fd);
fd

Expand All @@ -67,7 +85,8 @@ let fetch_loc store loc schema =
let rec fetch lnk =
match !lnk with
| In_memory v | In_memory_reused v -> v
| Serialized _ | Small _ -> invalid_arg "Granular_marshal.fetch: serialized"
| Serialized _ | Serialized_reused _ | Small _ ->
invalid_arg "Granular_marshal.fetch: serialized"
| Placeholder -> invalid_arg "Granular_marshal.fetch: during a write"
| Duplicate original_lnk ->
let v = fetch original_lnk in
Expand All @@ -78,17 +97,21 @@ let rec fetch lnk =
lnk := In_memory v;
v

let reuse lnk =
match !lnk with
| In_memory v -> lnk := In_memory_reused v
| In_memory_reused _ -> ()
| _ -> invalid_arg "Granular_marshal.reuse: not in memory"

let cache (type a) (module Key : Hashtbl.HashedType with type t = a) =
let module H = Hashtbl.Make (Key) in
let cache = H.create 16 in
fun (lnk : a link) ->
let key = fetch lnk in
match H.find cache key with
| original_lnk ->
(match !original_lnk with
| In_memory v -> original_lnk := In_memory_reused v
| In_memory_reused _ -> ()
| _ -> assert false);
assert (original_lnk != lnk);
reuse original_lnk;
lnk := Duplicate original_lnk
| exception Not_found -> H.add cache key lnk

Expand All @@ -106,43 +129,50 @@ let int_of_binstring s =
let write ?(flags = []) fd root_schema root_value =
let pt_root = pos_out fd in
output_string fd (String.make ptr_size '\000');
let rec iter size restore =
let rec iter size ~placeholders ~restore =
{ yield =
(fun (type a) (lnk : a link) (schema : a schema) : unit ->
match !lnk with
| Serialized _ | Small _ | Placeholder -> ()
| Serialized _ | Serialized_reused _ | Small _ -> ()
| Placeholder -> failwith "big nono"
| In_memory_reused v -> write_child_reused lnk schema v
| Duplicate original_lnk ->
(iter size restore).yield original_lnk schema;
(match !original_lnk with
| Serialized_reused _ -> ()
| In_memory_reused v -> write_child_reused original_lnk schema v
| _ -> failwith "Granular_marshal.write: duplicate not reused");
lnk := !original_lnk
| In_memory v -> write_child lnk schema v size restore
| On_disk _ -> write_child lnk schema (fetch lnk) size restore)
| In_memory v -> write_child lnk schema v size ~placeholders ~restore
| On_disk _ ->
write_child lnk schema (fetch lnk) size ~placeholders ~restore)
}
and write_child : type a. a link -> a schema -> a -> _ =
fun lnk schema v size restore ->
fun lnk schema v size ~placeholders ~restore ->
let v_size = write_children schema v in
if v_size > 1024 then (
lnk := Serialized { loc = pos_out fd };
Marshal.to_channel fd v flags)
else (
size := !size + v_size;
restore := (fun () -> lnk := Small v) :: !restore;
lnk := Placeholder)
placeholders := (fun () -> lnk := Placeholder) :: !placeholders;
restore := (fun () -> lnk := Small v) :: !restore)
and write_children : type a. a schema -> a -> int =
fun schema v ->
let children_size = ref 0 in
let children_restore = ref [] in
schema (iter children_size children_restore) v;
let placeholders = ref [] in
let restore = ref [] in
schema (iter children_size ~placeholders ~restore) v;
List.iter (fun placehold -> placehold ()) !placeholders;
let v_size = Obj.(reachable_words (repr v)) in
List.iter (fun restore -> restore ()) !children_restore;
List.iter (fun restore -> restore ()) !restore;
!children_size + v_size
and write_child_reused : type a. a link -> a schema -> a -> _ =
fun lnk schema v ->
let children_size = ref 0 in
let children_restore = ref [] in
schema (iter children_size children_restore) v;
List.iter (fun restore -> restore ()) !children_restore;
lnk := Serialized { loc = pos_out fd };
let placeholders = ref [] in
let restore = ref [] in
schema (iter children_size ~placeholders ~restore) v;
lnk := Serialized_reused { loc = pos_out fd };
Marshal.to_channel fd v flags
in
let _ : int = write_children root_schema root_value in
Expand All @@ -151,7 +181,8 @@ let write ?(flags = []) fd root_schema root_value =
seek_out fd pt_root;
output_string fd (binstring_of_int root_loc)

let read store fd root_schema =
let read filename fd root_schema =
let store = { filename; cache = Cache.create 0 } in
let root_loc = int_of_binstring (really_input_string fd 8) in
let root_value = read_loc store fd root_loc root_schema in
root_value
4 changes: 4 additions & 0 deletions src/index-format/granular_marshal.mli
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ type 'a link
(** [link v] returns a new link to the in-memory value [v]. *)
val link : 'a -> 'a link

(** [reuse lnk] marks the link as being used more than once, to ensure proper
serialization of DAGs. *)
val reuse : 'a link -> unit

(** [cache (module Hash)] returns a function to de-duplicate links which share
the same value, resulting in a compressed file. *)
val cache : 'a. (module Hashtbl.HashedType with type t = 'a) -> 'a link -> unit
Expand Down
35 changes: 30 additions & 5 deletions src/index-format/index_format.ml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@ module Uid_map = Granular_map.Make (Shape.Uid)
module Stats = Map.Make (String)
module Uid_set = Shape.Uid.Set

module Union_find = struct
type t = Uid_set.t Union_find.element Granular_marshal.link

let make v = Granular_marshal.link (Union_find.make v)

let get t = Union_find.get (Granular_marshal.fetch t)

let union a b =
Granular_marshal.(
link (Union_find.union ~f:Uid_set.union (fetch a) (fetch b)))
end

let add map uid locs =
Uid_map.update uid
(function
Expand All @@ -21,23 +33,36 @@ type index =
cu_shape : (string, Shape.t) Hashtbl.t;
stats : stat Stats.t;
root_directory : string option;
related_uids : Uid_set.t Union_find.element Uid_map.t
related_uids : Union_find.t Uid_map.t
}

let lidset_schema iter lidset = Lid_set.schema iter Lid.schema lidset

let index_schema (iter : Granular_marshal.iter) index =
Uid_map.schema iter (fun iter _ v -> lidset_schema iter v) index.defs;
Uid_map.schema iter (fun iter _ v -> lidset_schema iter v) index.approximated;
Uid_map.schema iter (fun _ _ _ -> ()) index.related_uids
Uid_map.schema iter
(fun iter _ v ->
iter.Granular_marshal.yield v Granular_marshal.schema_no_sublinks)
index.related_uids

let compress index =
let cache = Lid.cache () in
let compress_map_set =
Uid_map.iter (fun _ -> Lid_set.iter (Lid.deduplicate cache))
in
compress_map_set index.defs;
compress_map_set index.approximated
compress_map_set index.approximated;
let related_uids =
Uid_map.map
(fun set ->
let uid = Uid_set.min_elt (Union_find.get set) in
let reference_set = Uid_map.find uid index.related_uids in
Granular_marshal.reuse reference_set;
reference_set)
index.related_uids
in
{ index with related_uids }

let pp_lidset fmt locs =
Format.pp_print_list
Expand All @@ -54,7 +79,7 @@ let pp_partials (fmt : Format.formatter) (partials : Lid_set.t Uid_map.t) =
Format.fprintf fmt "@]}"

let pp_related_uids (fmt : Format.formatter)
(related_uids : Uid_set.t Union_find.element Uid_map.t) =
(related_uids : Union_find.t Uid_map.t) =
let rec gather acc map =
match Uid_map.choose_opt map with
| Some (_key, union) ->
Expand Down Expand Up @@ -93,7 +118,7 @@ let ext = "ocaml-index"
let magic_number = Config.index_magic_number

let write ~file index =
compress index;
let index = compress index in
Misc.output_to_file_via_temporary ~mode:[ Open_binary ] file
(fun _temp_file_name oc ->
output_string oc magic_number;
Expand Down
9 changes: 8 additions & 1 deletion src/index-format/index_format.mli
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ module Lid_set : Granular_set.S with type elt = Lid.t
module Stats : Map.S with type key = String.t
module Uid_set = Shape.Uid.Set
module Uid_map : Granular_map.S with type key = Shape.Uid.t
module Union_find : sig
type t

val make : Uid_set.t -> t
val get : t -> Uid_set.t
val union : t -> t -> t
end

type stat = { mtime : float; size : int; source_digest : string option }

Expand All @@ -21,7 +28,7 @@ type index =
cu_shape : (string, Shape.t) Hashtbl.t;
stats : stat Stats.t;
root_directory : string option;
related_uids : Uid_set.t Union_find.element Uid_map.t
related_uids : Union_find.t Uid_map.t
}

val pp : Format.formatter -> index -> unit
Expand Down
5 changes: 2 additions & 3 deletions src/ocaml-index/lib/index.ml
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,7 @@ let index_of_cmt ~into ~root ~rewrite_root ~build_path ~do_not_use_cmt_loadpath
let map_update uid =
Uid_map.update uid (function
| None -> Some union
| Some union' ->
Some (Union_find.union ~f:Uid_set.union union' union))
| Some union' -> Some (Union_find.union union' union))
in
acc |> map_update uid1 |> map_update uid2)
into.related_uids cmt_declaration_dependencies
Expand All @@ -162,7 +161,7 @@ let merge_index ~store_shapes ~into index =
let stats = Stats.union (fun _ f1 _f2 -> Some f1) into.stats index.stats in
let related_uids =
Uid_map.union
(fun _ a b -> Some (Union_find.union ~f:Uid_set.union a b))
(fun _ a b -> Some (Union_find.union a b))
index.related_uids into.related_uids
in
if store_shapes then
Expand Down
54 changes: 54 additions & 0 deletions tests/test-dirs/occurrences/project-wide/union.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
Serialization of `related_uids` requires special cares as the union-find
algorithm relies on deserialization preserving physical identity (for mutations
to work). The issue manifested on sufficiently large indexes (if small, then
the marshal wouldn't be granular):

$ NB=1024
$ for i in $(seq 1 $NB); do echo "let x$i = 0"; done >test.ml
$ for i in $(seq 1 $NB); do echo "val x$i : int"; done >test.mli
$ $OCAMLC -bin-annot -bin-annot-occurrences -c test.mli test.ml

A signature containing the same symbols:

$ echo "module type S = sig\n$(cat test.mli)\nend" >sig.ml
$ echo "module type S = sig\n$(cat test.mli)\nend" >sig.mli
$ $OCAMLC -bin-annot -bin-annot-occurrences -c sig.mli sig.ml

At this point `ŧest` and `sig` are unrelated. We'll later force their unification with:

$ cat >both.ml <<EOF
> module M = (Test : Sig.S)
> EOF
$ $OCAMLC -bin-annot -bin-annot-occurrences -c both.ml

$ cat > .merlin << EOF
> INDEX project.ocaml-index
> SOURCE_ROOT .
> EOF

First compute the index for `test` and `sig`:

$ ocaml-index aggregate test.cmti test.cmt sig.cmti sig.cmt --root . --rewrite-root
$ mv project.ocaml-index test_sig.ocaml-index

Then for `both`:

$ ocaml-index aggregate both.cmt --root . --rewrite-root

Merge everything together, which reveals the relation between `test` and `sig` uids:

$ ocaml-index aggregate test_sig.ocaml-index project.ocaml-index

All files should be listed on queries: (except `both.ml`)

$ $MERLIN single occurrences -scope renaming -identifier-at 1:5 -filename test.ml < test.ml | jq '.value[] | .file'
"$TESTCASE_ROOT/test.ml"
"$TESTCASE_ROOT/sig.ml"
"$TESTCASE_ROOT/sig.mli"
"$TESTCASE_ROOT/test.mli"

$ $MERLIN single occurrences -scope renaming -identifier-at 50:5 -filename test.ml < test.ml | jq '.value[] | .file'
"$TESTCASE_ROOT/test.ml"
"$TESTCASE_ROOT/sig.ml"
"$TESTCASE_ROOT/sig.mli"
"$TESTCASE_ROOT/test.mli"

0 comments on commit 49a53ad

Please sign in to comment.