Skip to content

Commit

Permalink
added minimum number of postings to process and query-relative rho
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtrotman committed Nov 26, 2021
1 parent 246ea8a commit 973201d
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 22 deletions.
54 changes: 39 additions & 15 deletions anytime/JASS_anytime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,30 +20,36 @@
PARAMETERS
----------
*/
static double rho = 100.0; ///< In the anytime paper rho is the prcentage of the collection that should be used as a cap to the number of postings processed.
static double rho = 100.0; ///< In the anytime paper rho is the percentage of the collection that should be used as a cap to the number of postings processed.
static double rho_min = 0; ///< Percentage of the collection to use as the minimum number of postings to process (the -⌊r⌋ parameter); 0 means no minimum is set.
static double parameter_relative_rho = 100.0; ///< Percentage of all the postings for this query that should be processed (the -ℝ parameter)
static size_t maximum_number_of_postings_to_process = 0; ///< Computed from rho, or given directly with the -R parameter
static std::string parameter_queryfilename; ///< Name of file containing the queries
static size_t minimum_number_of_postings_to_process = 0; ///< The minimum number of postings to process (to prevent "way too early" early termination); the -⌊R⌋ parameter
static std::string parameter_queryfilename; ///< Name of file containing the queries
static size_t parameter_threads = 1; ///< Number of concurrent queries
static size_t parameter_top_k = 10; ///< Number of results to return
static size_t parameter_top_k = 10; ///< Number of results to return
static size_t accumulator_width = 0; ///< The width (2^accumulator_width) of the accumulator 2-D array (if they are being used).
static bool parameter_ascii_query_parser = false; ///< When true use the ASCII pre-casefolded query parser
static bool parameter_help = false; ///< Print the usage information
static bool parameter_index_v2 = false; ///< The index is a JASS version 2 index
static bool parameter_help = false; ///< Print the usage information
static bool parameter_index_v2 = false; ///< The index is a JASS version 2 index
static std::string parameter_rsv_scores_filename; ///< The name of the file containing ordered pairs <query_id> <rsv> for the minimum rsv to be found

static std::string parameters_errors; ///< Any errors as a result of command line parsing
/*
	Each entry binds a short name, a long name and a help string to the static variable that receives the value.

	main_event() applies the postings limits in this order, each only when it differs from its default:
	-⌊R⌋ (absolute minimum), -r (maximum as a percent of the collection), -⌊r⌋ (minimum as a percent of
	the collection), then -ℝ (percent of the query's own postings).

	NOTE(review): the -⌊r⌋ help text says "(overrides -R)", but main_event() passes rho_min to
	set_postings_to_process_proportion_minimum(), which replaces the minimum set by -⌊R⌋ — confirm and correct the string.
*/
static auto parameters = std::make_tuple ///< The command line parameter block
(
JASS::commandline::parameter("-?", "--help", " Print this help.", parameter_help),
JASS::commandline::parameter("-2", "--v2_index", " The index is a JASS v2 index", parameter_index_v2),
JASS::commandline::parameter("-a", "--asciiparser ", " Use simple query parser (ASCII seperated pre-casefolded tokens)", parameter_ascii_query_parser),
JASS::commandline::parameter("-k", "--top-k", "<top-k> Number of results to return to the user (top-k value) [default = -k10]", parameter_top_k),
JASS::commandline::parameter("-q", "--queryfile", "<filename> Name of file containing a list of queries (1 per line, each line prefixed with query-id)", parameter_queryfilename),
JASS::commandline::parameter("-Q", "--queryrsvfile", "<filename> Name of file containing a list of the minimum rsv value for a document to be found (1 per line: <query_id> <rsv>)", parameter_rsv_scores_filename),
JASS::commandline::parameter("-r", "--rho", "<integer_percent> Percent of the collection size to use as max number of postings to process [default = -r100] (overrides -RHO)", rho),
JASS::commandline::parameter("-R", "--RHO", "<integer_max> Max number of postings to process [default is all] (overridden by -rho)", maximum_number_of_postings_to_process),
JASS::commandline::parameter("-t", "--threads", "<threadcount> Number of threads to use (one query per thread) [default = -t1]", parameter_threads),
JASS::commandline::parameter("-w", "--width", "<2^w> The width of the 2D accumulator array (2^w is used)", accumulator_width)
JASS::commandline::parameter("-?", "--help", " Print this help.", parameter_help),
JASS::commandline::parameter("-2", "--v2_index", " The index is a JASS v2 index", parameter_index_v2),
JASS::commandline::parameter("-a", "--asciiparser ", " Use simple query parser (ASCII seperated pre-casefolded tokens)", parameter_ascii_query_parser),
JASS::commandline::parameter("-k", "--top-k", "<top-k> Number of results to return to the user (top-k value) [default = -k10]", parameter_top_k),
JASS::commandline::parameter("-q", "--queryfile", "<filename> Name of file containing a list of queries (1 per line, each line prefixed with query-id)", parameter_queryfilename),
JASS::commandline::parameter("-Q", "--queryrsvfile", "<filename> Name of file containing a list of the minimum rsv value for a document to be found (1 per line: <query_id> <rsv>)", parameter_rsv_scores_filename),
JASS::commandline::parameter("-r", "--rho", "<integer_percent> Percent of the collection size to use as max number of postings to process [default = -r100] (overrides -R)", rho),
JASS::commandline::parameter("-⌊r⌋", "--rho_min", "<integer_percent> Percent of the collection size to use as minimum number of postings to process [default is 0] (overrides -R)", rho_min),
JASS::commandline::parameter("-R", "--RHO", "<integer_max> Max number of postings to process [default is all]", maximum_number_of_postings_to_process),
JASS::commandline::parameter("-⌊R⌋", "--RHO_min", "<integer_min> Minimum number of postings to process [default is 0]", minimum_number_of_postings_to_process),
JASS::commandline::parameter("-ℝ", "--Relative_RHO", "<integer_percent> Percent of this queries postings to use as max number of postings to process [default = -ℝ100] (overrides -R and -r)", parameter_relative_rho),
JASS::commandline::parameter("-t", "--threads", "<threadcount> Number of threads to use (one query per thread) [default = -t1]", parameter_threads),
JASS::commandline::parameter("-w", "--width", "<2^w> The width of the 2D accumulator array (2^w is used)", accumulator_width)
);

/*
Expand Down Expand Up @@ -199,12 +205,30 @@ static int main_event(int argc, const char *argv[])
std::cout << "Failure to set the number of postings to process to " << maximum_number_of_postings_to_process << '\n';
return 0;
}
if (minimum_number_of_postings_to_process != 0)
if (engine.set_postings_to_process_minimum(minimum_number_of_postings_to_process) != JASS_ERROR_OK)
{
std::cout << "Failure to set the minimum number of postings to process to " << minimum_number_of_postings_to_process << '\n';
return 0;
}
if (rho != 100.0)
if (engine.set_postings_to_process_proportion(rho) != JASS_ERROR_OK)
{
std::cout << "Failure to set the proportion of postings to process\n";
return 0;
}
if (rho_min != 0)
if (engine.set_postings_to_process_proportion_minimum(rho_min) != JASS_ERROR_OK)
{
std::cout << "Failure to set the minimum proportion of postings to process\n";
return 0;
}
if (parameter_relative_rho != 100)
if (engine.set_postings_to_process_relative(parameter_relative_rho) != JASS_ERROR_OK)
{
std::cout << "Failure to set the relative postings stopping condition\n";
return 0;
}

/*
Report the number of postings we're going to process
Expand Down
59 changes: 56 additions & 3 deletions anytime/JASS_anytime_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ JASS_anytime_api::JASS_anytime_api()
index = nullptr;
precomputed_minimum_rsv_table = new JASS::top_k_limit;
postings_to_process = (std::numeric_limits<size_t>::max)();
postings_to_process_min = 0;
relative_postings_to_process = 100;
top_k = 10;
which_query_parser = JASS::parser_query::parser_type::query;
accumulator_width = 0;
Expand Down Expand Up @@ -127,6 +129,31 @@ JASS_ERROR JASS_anytime_api::set_postings_to_process_proportion(double percent)
return JASS_ERROR_OK;
}

/*
JASS_ANYTIME_API::SET_POSTINGS_TO_PROCESS_PROPORTION_MINIMUM()
--------------------------------------------------------------
*/
JASS_ERROR JASS_anytime_api::set_postings_to_process_proportion_minimum(double percent)
{
	/*
		The minimum is expressed relative to the number of documents in the collection,
		so it cannot be computed until an index has been loaded.
	*/
	if (index == nullptr)
		return JASS_ERROR_NO_INDEX;

	/*
		Converting a negative (or NaN) double to size_t is undefined behaviour, and a
		negative floor is meaningless anyway, so anything that is not strictly positive
		is treated as "no minimum".
	*/
	if (!(percent > 0.0))
		postings_to_process_min = 0;
	else
		postings_to_process_min = static_cast<size_t>(static_cast<double>(index->document_count()) * percent / 100.0);

	return JASS_ERROR_OK;
}

/*
JASS_ANYTIME_API::SET_POSTINGS_TO_PROCESS_RELATIVE()
----------------------------------------------------
*/
JASS_ERROR JASS_anytime_api::set_postings_to_process_relative(double percent)
{
	/*
		Callers supply a percentage; it is stored as a fraction so that anytime() can multiply
		it directly by the number of postings for the query.

		NOTE(review): anytime() applies the stored value whenever it is not 1, but the constructor
		seeds the member with 100 rather than 1 — confirm the intended default.
	*/
	const double as_fraction = percent / 100.0;
	relative_postings_to_process = as_fraction;

	return JASS_ERROR_OK;
}

/*
JASS_ANYTIME_API::SET_POSTINGS_TO_PROCESS()
-------------------------------------------
Expand All @@ -138,6 +165,18 @@ JASS_ERROR JASS_anytime_api::set_postings_to_process(size_t count)
return JASS_ERROR_OK;
}

/*
JASS_ANYTIME_API::SET_POSTINGS_TO_PROCESS_MINIMUM()
---------------------------------------------------
*/
JASS_ERROR JASS_anytime_api::set_postings_to_process_minimum(size_t count)
{
	/*
		An absolute floor: stored as given, so no index needs to be loaded and this cannot fail.
	*/
	this->postings_to_process_min = count;
	return JASS_ERROR_OK;
}


/*
JASS_ANYTIME_API::GET_POSTINGS_TO_PROCESS()
-------------------------------------------
Expand Down Expand Up @@ -396,6 +435,7 @@ void JASS_anytime_api::anytime(JASS_anytime_thread_result &output, const JASS::d
uint32_t largest_possible_rsv = (std::numeric_limits<decltype(largest_possible_rsv)>::min)();
uint32_t largest_possible_rsv_with_overflow;
uint32_t smallest_possible_rsv = (std::numeric_limits<decltype(smallest_possible_rsv)>::max)();
uint64_t total_postings_for_query = 0;
//std::cout << "\n";
for (const auto &term : jass_query->terms())
{
Expand All @@ -413,7 +453,9 @@ void JASS_anytime_api::anytime(JASS_anytime_thread_result &output, const JASS::d
*/
uint32_t term_smallest_impact;
uint32_t term_largest_impact;
current_segment += index.get_segment_list(current_segment, metadata, term.frequency(), term_smallest_impact, term_largest_impact);
JASS::query::DOCID_TYPE document_frequency;
current_segment += index.get_segment_list(current_segment, metadata, term.frequency(), term_smallest_impact, term_largest_impact, document_frequency);
total_postings_for_query += document_frequency;

/*
Compute the largest and smallest possible rsv values
Expand Down Expand Up @@ -460,9 +502,14 @@ void JASS_anytime_api::anytime(JASS_anytime_thread_result &output, const JASS::d
{
scale_rsv_scores = true;
smallest_possible_rsv = (uint32_t)((double)largest_possible_rsv / (double)largest_possible_rsv * (double)JASS::query::MAX_RSV);
// rsv_at_k = (JASS::query::ACCUMULATOR_TYPE)((double)rsv_at_k / (double)largest_possible_rsv * (double)JASS::query::MAX_RSV);
largest_possible_rsv = (uint32_t)((double)largest_possible_rsv / (double)largest_possible_rsv * (double)JASS::query::MAX_RSV);

/*
This line (commented out) re-scales the rsv_at_k value, which we need to do if that score comes
from some other search engine (which is unlikely to occur).
*/
// rsv_at_k = (JASS::query::ACCUMULATOR_TYPE)((double)rsv_at_k / (double)largest_possible_rsv * (double)JASS::query::MAX_RSV);

/*
Check for zeros
*/
Expand All @@ -473,6 +520,12 @@ void JASS_anytime_api::anytime(JASS_anytime_thread_result &output, const JASS::d
jass_query->rewind(smallest_possible_rsv, rsv_at_k, largest_possible_rsv);
//std::cout << "MAXRSV:" << largest_possible_rsv << " MINRSV:" << smallest_possible_rsv << "\n";

/*
Check to see if we've got a rho stopping condition relative to the number of postings in this query.
*/
if (relative_postings_to_process != 1)
postings_to_process = total_postings_for_query * relative_postings_to_process;

/*
Process the segments
*/
Expand Down Expand Up @@ -501,7 +554,7 @@ void JASS_anytime_api::anytime(JASS_anytime_thread_result &output, const JASS::d
/*
Early terminate if we have filled the heap with documents having rsv scores higher than the rsv_at_k oracle score.
*/
if (rsv_at_k > 1 && jass_query->size() >= top_k)
if (rsv_at_k > 1 && jass_query->size() >= top_k && postings_processed >= postings_to_process_min)
break;
}
/*
Expand Down
46 changes: 46 additions & 0 deletions anytime/JASS_anytime_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ class JASS_anytime_api
JASS::deserialised_jass_v1 *index; ///< The index
JASS::top_k_limit *precomputed_minimum_rsv_table; ///< Oracle scores (estimates of the rsv for the document at k)
size_t postings_to_process; ///< The maximum number of postings to process
size_t postings_to_process_min; ///< Process at least this number of postings before the rsv_at_k (oracle) early termination in anytime() may be taken
double relative_postings_to_process; ///< Fraction (percent / 100) of this query's postings to process, as stored by set_postings_to_process_relative(); anytime() applies it when it is not 1. NOTE(review): the constructor initialises this to 100, not 1 — confirm the default.
size_t top_k; ///< The number of documents we want in the results list
JASS::parser_query::parser_type which_query_parser; ///< Use the simple ASCII parser or the regular query parser
size_t accumulator_width; ///< Width of the accumulator array
Expand Down Expand Up @@ -205,6 +207,35 @@ class JASS_anytime_api
*/
JASS_ERROR set_postings_to_process_proportion(double percent);


/*
JASS_ANYTIME_API::SET_POSTINGS_TO_PROCESS_PROPORTION_MINIMUM()
--------------------------------------------------------------
*/
/*!
@brief Set the minimum number of postings to process as a proportion of the number of documents in the collection.
@details An index must be loaded before this method is called; if not, it returns JASS_ERROR_NO_INDEX and has no effect.
The minimum is a floor on the rsv_at_k (oracle) early termination in anytime(): that exit is not taken until at least this many postings have been processed.
The constructor sets the minimum to 0 (no floor). This call replaces any value set with set_postings_to_process_minimum().
@param percent [in] Percent of the number of documents in the collection (for example, 10 sets the minimum to 10% of the document count)
@return JASS_ERROR_OK or JASS_ERROR_NO_INDEX
*/
JASS_ERROR set_postings_to_process_proportion_minimum(double percent);

/*
JASS_ANYTIME_API::SET_POSTINGS_TO_PROCESS_RELATIVE()
----------------------------------------------------
*/
/*!
@brief Set the number of postings to process as a proportion of the number of postings for this query.
@details An index does not need to be loaded first.
This method takes precedence over set_postings_to_process() and set_postings_to_process_proportion(): anytime() overwrites the postings cap with this proportion of the query's total postings.
The percent is stored as a fraction (percent / 100).
NOTE(review): anytime() applies the stored value whenever it is not 1, but the constructor initialises it to 100 — confirm the intended default.
@param percent [in] The percent to use (for example, 10 means process 10% of this query's postings)
@return JASS_ERROR_OK
*/
JASS_ERROR set_postings_to_process_relative(double percent);


/*
JASS_ANYTIME_API::SET_POSTINGS_TO_PROCESS()
-------------------------------------------
Expand All @@ -219,6 +250,21 @@ class JASS_anytime_api
*/
JASS_ERROR set_postings_to_process(size_t count);


/*
JASS_ANYTIME_API::SET_POSTINGS_TO_PROCESS_MINIMUM()
---------------------------------------------------
*/
/*!
@brief Set the minimum number of postings to process as an absolute number.
@details An index does not need to be loaded first.
The minimum is a floor on the rsv_at_k (oracle) early termination in anytime(): that exit is not taken until at least this many postings have been processed.
The constructor sets the minimum to 0 (no floor).
@param count [in] The minimum number of postings to process
@return JASS_ERROR_OK
*/
JASS_ERROR set_postings_to_process_minimum(size_t count);


/*
JASS_ANYTIME_API::GET_POSTINGS_TO_PROCESS()
-------------------------------------------
Expand Down
6 changes: 4 additions & 2 deletions source/deserialised_jass_v1.h
Original file line number Diff line number Diff line change
Expand Up @@ -328,18 +328,20 @@ namespace JASS
@param largest [out] The largest impact score for this term
@param document_frequency [out] The sum of the segment frequencies over all of this term's segments
@return The number of segments extracted and added to the list
*/
virtual size_t get_segment_list(segment_header *segments, metadata &metadata, size_t term_frequency, uint32_t &smallest, uint32_t &largest) const
virtual size_t get_segment_list(segment_header *segments, metadata &metadata, size_t query_term_frequency, uint32_t &smallest, uint32_t &largest, query::DOCID_TYPE &document_frequency) const
{
document_frequency = 0;
segment_header *current_segment = segments;
for (uint64_t segment = 0; segment < metadata.impacts; segment++)
{
uint64_t *postings_list = (uint64_t *)metadata.offset;
segment_header_on_disk *next_segment_in_postings_list = (segment_header_on_disk *)(postings() + postings_list[segment]);

current_segment->impact = next_segment_in_postings_list->impact * term_frequency;
current_segment->impact = next_segment_in_postings_list->impact * query_term_frequency;
current_segment->offset = next_segment_in_postings_list->offset;
current_segment->end = next_segment_in_postings_list->end;
current_segment->segment_frequency = next_segment_in_postings_list->segment_frequency;
document_frequency += next_segment_in_postings_list->segment_frequency;

current_segment++;
}
Expand Down
6 changes: 4 additions & 2 deletions source/deserialised_jass_v2.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,9 @@ namespace JASS
@param largest [out] The largest impact score for this term
@param document_frequency [out] The sum of the segment frequencies over all of this term's segments
@return The number of segments extracted and added to the list
*/
virtual size_t get_segment_list(segment_header *segments, metadata &metadata, size_t term_frequency, uint32_t &smallest, uint32_t &largest) const
virtual size_t get_segment_list(segment_header *segments, metadata &metadata, size_t query_term_frequency, uint32_t &smallest, uint32_t &largest, query::DOCID_TYPE &document_frequency) const
{
document_frequency = 0;
/*
Extract all the segments
*/
Expand All @@ -139,8 +140,9 @@ namespace JASS
compress_integer_variable_byte::decompress_into(&current_segment->end , segment_header_pointer);
compress_integer_variable_byte::decompress_into(&current_segment->segment_frequency, segment_header_pointer);
current_segment->offset += segment_header_pointer - postings(); //v2 index is relative to the segment header
current_segment->impact *= term_frequency;
current_segment->impact *= query_term_frequency;
current_segment->end += current_segment->offset; // V2 indexes store length rather than an end pointer
document_frequency += current_segment->segment_frequency;
current_segment++;
}

Expand Down

0 comments on commit 973201d

Please sign in to comment.