Commit 4f63d3cc authored by Gijs Hendriksen

Add assertion that scores are equal across engines

parent 9b5c4bf3
...@@ -25,7 +25,7 @@ class Index(ABC): ...@@ -25,7 +25,7 @@ class Index(ABC):
for term in word_tokenize(body.lower()): for term in word_tokenize(body.lower()):
if term not in self.stopwords and term.isalpha(): if term not in self.stopwords and term.isalpha():
terms[term[:32]] += 1 terms[term] += 1
return terms return terms
...@@ -208,7 +208,8 @@ class DuckDBIndex(Index): ...@@ -208,7 +208,8 @@ class DuckDBIndex(Index):
def search(self, query): def search(self, query):
self.cursor.execute(query) self.cursor.execute(query)
return self.cursor.fetchdf() df = self.cursor.fetchdf()
return list(df.itertuples(index=False, name=None))[:10]
def clear(self): def clear(self):
self.cursor.execute("DELETE FROM terms") self.cursor.execute("DELETE FROM terms")
......
import argparse import argparse
import math
import time import time
from index import Index, DuckDBIndex, MonetDBIndex from index import Index, DuckDBIndex, MonetDBIndex
from search import Search from search import Search
...@@ -40,28 +41,34 @@ def benchmark(args: argparse.Namespace): ...@@ -40,28 +41,34 @@ def benchmark(args: argparse.Namespace):
iterations = 20 iterations = 20
scores = [[] for _ in range(len(indices))]
for filename in args.input: for filename in args.input:
benchmark_times = [] benchmark_times = []
print(f'Filename: "{filename}"') print(f'Filename: "{filename}"')
for index in indices: for i, index in enumerate(indices):
index.clear() index.clear()
print('Indexing...') print('Indexing...')
index.bulk_index(filename) index.bulk_index(filename)
search = Search(index)
times = [] times = []
for query in queries: for query in queries:
start = time.time() start = time.time()
for _ in range(iterations): for _ in range(iterations):
search = Search(index)
search.search(query) search.search(query)
end = time.time() end = time.time()
avg_time = (end - start) / iterations avg_time = (end - start) / iterations
times.append(f'{avg_time:.04}s') times.append(f'{avg_time:.4f}s')
# Compare the scores to verify both engines return the same results
scores[i].append(search.search(query))
benchmark_times.append(times) benchmark_times.append(times)
...@@ -74,6 +81,11 @@ def benchmark(args: argparse.Namespace): ...@@ -74,6 +81,11 @@ def benchmark(args: argparse.Namespace):
print() print()
for i in range(len(scores[0])):
for duck_scores, monet_scores in zip(scores[0][i], scores[1][i]):
assert duck_scores[0] == monet_scores[0], 'Retrieved documents are not equal!'
assert math.isclose(duck_scores[1], monet_scores[1], abs_tol=1e-2), f'Scores are unequal: {duck_scores[1]}, {monet_scores[1]}'
def dump_index(args: argparse.Namespace): def dump_index(args: argparse.Namespace):
index = Index.get_index(args.engine, args.database) index = Index.get_index(args.engine, args.database)
......
...@@ -16,7 +16,7 @@ def bm25(terms, disjunctive=True): ...@@ -16,7 +16,7 @@ def bm25(terms, disjunctive=True):
AS cdocs ON term_tf.docid = cdocs.docid AS cdocs ON term_tf.docid = cdocs.docid
JOIN docs ON term_tf.docid=docs.docid JOIN docs ON term_tf.docid=docs.docid
JOIN dict ON term_tf.termid=dict.termid) JOIN dict ON term_tf.termid=dict.termid)
SELECT scores.docid, score FROM (SELECT docid, sum(subscore) AS score SELECT docs.name, score FROM (SELECT docid, sum(subscore) AS score
FROM subscores GROUP BY docid) AS scores JOIN docs ON FROM subscores GROUP BY docid) AS scores JOIN docs ON
scores.docid=docs.docid ORDER BY score DESC; scores.docid=docs.docid ORDER BY score DESC;
""" """
...@@ -9,6 +9,7 @@ class Search: ...@@ -9,6 +9,7 @@ class Search:
self.index = index self.index = index
def search(self, terms, method='bm25'): def search(self, terms, method='bm25'):
terms = self.index.get_terms(' '.join(terms)).keys()
if method == 'bm25': if method == 'bm25':
sql_query = query.bm25(terms) sql_query = query.bm25(terms)
else: else:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment