-
Notifications
You must be signed in to change notification settings - Fork 14
/
JASSjr_search.rb
executable file
·74 lines (56 loc) · 2.27 KB
/
JASSjr_search.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env ruby
# Copyright (c) 2024 Vaughan Kitchen
# Minimalistic BM25 search engine.
# BM25 tuning parameters, used by the ranking formula in the search loop
k1 = 0.9 # BM25 k1 parameter
b = 0.4 # BM25 b parameter

# Primary keys: one external document identifier per line of docids.bin
doc_ids = File.readlines("docids.bin", chomp: true)

# Document lengths, stored as native-endian 32-bit signed integers
doc_lengths = File.binread("lengths.bin").unpack("l*")
average_length = doc_lengths.sum.to_f / doc_lengths.size

# Decode the vocabulary into term => [where, size], the location of the term's postings.
# Each record is: unsigned length byte, the term's bytes, a '\0' terminator,
# a 4 byte signed "where" and a 4 byte signed "size".
vocab = {}
vocab_raw = File.binread("vocab.bin")
cursor = 0
until cursor >= vocab_raw.bytesize
  term_length = vocab_raw.getbyte(cursor)
  term_start = cursor + 1
  term = vocab_raw.byteslice(term_start, term_length)
  cursor = term_start + term_length + 1 # step over the term and its null terminator
  vocab[term] = vocab_raw.unpack("ll", offset: cursor)
  cursor += 8 # two 4 byte integers
end
# Search (one query per line, read with Kernel#gets until end of input).
# For every query the matching documents are scored with BM25 and the best
# (at most) 1000 are printed in TREC eval format.
loop do
  query = gets&.split
  break if query.nil?

  query_id = 0
  accumulators = Hash.new(0) # docid => accumulated BM25 score

  # If the first token is a number then assume a TREC query number, and skip it.
  # Base 10 is forced so a zero-padded id such as "051" is not read as octal, and
  # exception: false together with the nil guard means a blank line (no tokens)
  # simply produces no results instead of raising.
  leading_token = query.first
  parsed_id = leading_token && Integer(leading_token, 10, exception: false)
  unless parsed_id.nil?
    query_id = parsed_id
    query.shift
  end

  query.each do |term|
    postings_offset, postings_size = vocab[term]
    next if postings_offset.nil? # Does the term exist in the collection?

    # Seek and read the postings list: a flat run of (docid, tf) 32-bit integer pairs
    postings = File.binread("postings.bin", postings_size, postings_offset).unpack("l*")

    # Compute the IDF component of BM25 as log(N/n); n is the number of (docid, tf) pairs
    idf = Math.log(doc_ids.length.to_f / (postings.length / 2))

    # Process the postings list by simply adding the BM25 component for this document into the accumulators
    postings.each_slice(2) do |docid, tf|
      length_norm = 1 - b + b * (doc_lengths[docid] / average_length)
      accumulators[docid] += idf * ((tf * (k1 + 1)) / (tf + k1 * length_norm))
    end
  end

  # Rank by descending score, breaking ties on descending document ID. Every docid is
  # unique, so the order is total and does not depend on sort stability.
  ranked = accumulators.sort_by { |docid, rsv| [-rsv, -docid] }

  # Print the (at most) top 1000 documents in the results list in TREC eval format which is:
  # query-id Q0 document-id rank score run-name
  ranked.first(1000).each.with_index(1) do |(docid, rsv), rank|
    puts "#{query_id} Q0 #{doc_ids[docid]} #{rank} #{format('%.4f', rsv)} JASSjr"
  end
end