Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
Information Retrieval
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Gijs Hendriksen
Information Retrieval
Commits
4f63d3cc
Commit
4f63d3cc
authored
Dec 16, 2019
by
Gijs Hendriksen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add assertion that scores are equal across engines
parent
9b5c4bf3
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
20 additions
and
6 deletions
+20
-6
index.py
index.py
+3
-2
main.py
main.py
+15
-3
query.py
query.py
+1
-1
search.py
search.py
+1
-0
No files found.
index.py
View file @
4f63d3cc
...
...
@@ -25,7 +25,7 @@ class Index(ABC):
for
term
in
word_tokenize
(
body
.
lower
()):
if
term
not
in
self
.
stopwords
and
term
.
isalpha
():
terms
[
term
[:
32
]
]
+=
1
terms
[
term
]
+=
1
return
terms
...
...
@@ -208,7 +208,8 @@ class DuckDBIndex(Index):
def
search
(
self
,
query
):
self
.
cursor
.
execute
(
query
)
return
self
.
cursor
.
fetchdf
()
df
=
self
.
cursor
.
fetchdf
()
return
list
(
df
.
itertuples
(
index
=
False
,
name
=
None
))[:
10
]
def
clear
(
self
):
self
.
cursor
.
execute
(
"DELETE FROM terms"
)
...
...
main.py
View file @
4f63d3cc
import
argparse
import
math
import
time
from
index
import
Index
,
DuckDBIndex
,
MonetDBIndex
from
search
import
Search
...
...
@@ -40,28 +41,34 @@ def benchmark(args: argparse.Namespace):
iterations
=
20
scores
=
[[]
for
_
in
range
(
len
(
indices
))]
for
filename
in
args
.
input
:
benchmark_times
=
[]
print
(
f'Filename: "
{
filename
}
"'
)
for
i
ndex
in
indices
:
for
i
,
index
in
enumerate
(
indices
)
:
index
.
clear
()
print
(
'Indexing...'
)
index
.
bulk_index
(
filename
)
search
=
Search
(
index
)
times
=
[]
for
query
in
queries
:
start
=
time
.
time
()
for
_
in
range
(
iterations
):
search
=
Search
(
index
)
search
.
search
(
query
)
end
=
time
.
time
()
avg_time
=
(
end
-
start
)
/
iterations
times
.
append
(
f'
{
avg_time
:.
04
}
s'
)
times
.
append
(
f'
{
avg_time
:.
4
f
}
s'
)
# Compare the scores to verify both engines return the same results
scores
[
i
].
append
(
search
.
search
(
query
))
benchmark_times
.
append
(
times
)
...
...
@@ -74,6 +81,11 @@ def benchmark(args: argparse.Namespace):
print
()
for
i
in
range
(
len
(
scores
[
0
])):
for
duck_scores
,
monet_scores
in
zip
(
scores
[
0
][
i
],
scores
[
1
][
i
]):
assert
duck_scores
[
0
]
==
monet_scores
[
0
],
'Retrieved documents are not equal!'
assert
math
.
isclose
(
duck_scores
[
1
],
monet_scores
[
1
],
abs_tol
=
1e-2
),
f'Scores are unequal:
{
duck_scores
[
1
]
}
,
{
monet_scores
[
1
]
}
'
def
dump_index
(
args
:
argparse
.
Namespace
):
index
=
Index
.
get_index
(
args
.
engine
,
args
.
database
)
...
...
query.py
View file @
4f63d3cc
...
...
@@ -16,7 +16,7 @@ def bm25(terms, disjunctive=True):
AS cdocs ON term_tf.docid = cdocs.docid
JOIN docs ON term_tf.docid=docs.docid
JOIN dict ON term_tf.termid=dict.termid)
SELECT
scores.docid
, score FROM (SELECT docid, sum(subscore) AS score
SELECT
docs.name
, score FROM (SELECT docid, sum(subscore) AS score
FROM subscores GROUP BY docid) AS scores JOIN docs ON
scores.docid=docs.docid ORDER BY score DESC;
"""
search.py
View file @
4f63d3cc
...
...
@@ -9,6 +9,7 @@ class Search:
self
.
index
=
index
def
search
(
self
,
terms
,
method
=
'bm25'
):
terms
=
self
.
index
.
get_terms
(
' '
.
join
(
terms
)).
keys
()
if
method
==
'bm25'
:
sql_query
=
query
.
bm25
(
terms
)
else
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment