Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
Information Retrieval
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Gijs Hendriksen
Information Retrieval
Commits
f5572c0e
Commit
f5572c0e
authored
Nov 22, 2019
by
Gijs Hendriksen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Type hinting and better output formatting
parent
df3bb8ca
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
55 additions
and
28 deletions
+55
-28
index.py
index.py
+34
-22
main.py
main.py
+7
-5
poetry.lock
poetry.lock
+13
-1
pyproject.toml
pyproject.toml
+1
-0
No files found.
index.py
View file @
f5572c0e
...
@@ -2,14 +2,19 @@ import duckdb
...
@@ -2,14 +2,19 @@ import duckdb
import
json
import
json
import
os
import
os
from
collections
import
defaultdict
from
collections
import
defaultdict
from
nltk
import
corpus
from
typing
import
List
class
Index
:
class
Index
:
def
__init__
(
self
,
db
):
cursor
:
duckdb
.
Cursor
stopwords
:
List
[
str
]
def
__init__
(
self
,
db
:
str
,
stopwords
:
List
[
str
]
=
None
):
db_exists
=
os
.
path
.
exists
(
db
)
db_exists
=
os
.
path
.
exists
(
db
)
self
.
cursor
=
duckdb
.
connect
(
db
).
cursor
()
self
.
cursor
=
duckdb
.
connect
(
db
).
cursor
()
self
.
stopwords
=
[
'i'
,
'on'
,
'my'
,
'and'
,
'in'
,
'the'
]
self
.
stopwords
=
stopwords
or
corpus
.
stopwords
.
words
(
'english'
)
if
db_exists
:
if
db_exists
:
self
.
reset_auto_increment
()
self
.
reset_auto_increment
()
...
@@ -44,7 +49,7 @@ class Index:
...
@@ -44,7 +49,7 @@ class Index:
'docid INTEGER NOT NULL,'
'docid INTEGER NOT NULL,'
'count INTEGER NOT NULL)'
)
'count INTEGER NOT NULL)'
)
def
get_terms
(
self
,
body
):
def
get_terms
(
self
,
body
:
str
):
terms
=
defaultdict
(
int
)
terms
=
defaultdict
(
int
)
for
term
in
body
.
lower
().
split
():
for
term
in
body
.
lower
().
split
():
...
@@ -53,7 +58,7 @@ class Index:
...
@@ -53,7 +58,7 @@ class Index:
return
terms
return
terms
def
index
(
self
,
document
):
def
index
(
self
,
document
:
dict
):
terms
=
self
.
get_terms
(
document
[
'body'
])
terms
=
self
.
get_terms
(
document
[
'body'
])
doc_name
=
document
[
'name'
]
doc_name
=
document
[
'name'
]
doc_length
=
len
(
terms
)
doc_length
=
len
(
terms
)
...
@@ -73,9 +78,13 @@ class Index:
...
@@ -73,9 +78,13 @@ class Index:
self
.
cursor
.
execute
(
f"INSERT INTO terms VALUES (
{
term_id
}
,
{
doc_id
}
,
{
frequency
}
)"
)
self
.
cursor
.
execute
(
f"INSERT INTO terms VALUES (
{
term_id
}
,
{
doc_id
}
,
{
frequency
}
)"
)
def
bulk_index
(
self
,
filename
):
def
bulk_index
(
self
,
filename
:
str
):
with
open
(
filename
)
as
_file
:
try
:
data
=
json
.
load
(
_file
)
with
open
(
filename
)
as
_file
:
data
=
json
.
load
(
_file
)
except
json
.
JSONDecodeError
:
print
(
'[!] Invalid input file!'
)
return
for
document
in
data
:
for
document
in
data
:
self
.
index
(
document
)
self
.
index
(
document
)
...
@@ -85,18 +94,21 @@ class Index:
...
@@ -85,18 +94,21 @@ class Index:
self
.
cursor
.
execute
(
"DELETE FROM docs"
)
self
.
cursor
.
execute
(
"DELETE FROM docs"
)
self
.
cursor
.
execute
(
"DELETE FROM dict"
)
self
.
cursor
.
execute
(
"DELETE FROM dict"
)
def
print_index
(
self
):
def
__str__
(
self
):
print
(
'dict'
)
dict_rows
=
self
.
cursor
.
execute
(
'SELECT * FROM dict'
).
fetchdf
()
dict_rows
=
self
.
cursor
.
execute
(
'SELECT * FROM dict'
).
fetchall
()
term_rows
=
self
.
cursor
.
execute
(
'SELECT * FROM terms'
).
fetchdf
()
for
row
in
dict_rows
:
doc_rows
=
self
.
cursor
.
execute
(
'SELECT * FROM docs'
).
fetchdf
()
print
(
'
\t
'
.
join
(
map
(
str
,
row
)))
return
'
\n
'
.
join
([
print
(
'terms'
)
'dict'
,
term_rows
=
self
.
cursor
.
execute
(
'SELECT * FROM terms'
).
fetchall
()
'-'
*
20
,
for
row
in
term_rows
:
str
(
dict_rows
),
print
(
'
\t
'
.
join
(
map
(
str
,
row
)))
''
,
'terms'
,
print
(
'docs'
)
'-'
*
20
,
doc_rows
=
self
.
cursor
.
execute
(
'SELECT * FROM docs'
).
fetchall
()
str
(
term_rows
),
for
row
in
doc_rows
:
''
,
print
(
'
\t
'
.
join
(
map
(
str
,
row
)))
'docs'
,
'-'
*
20
,
str
(
doc_rows
),
])
main.py
View file @
f5572c0e
from
argparse
import
ArgumentParser
from
argparse
import
ArgumentParser
import
argparse
from
index
import
Index
from
index
import
Index
def
bulk_index
(
index
,
args
):
def
bulk_index
(
index
:
Index
,
args
:
argparse
.
Namespace
):
filename
=
args
.
data
filename
=
args
.
data
index
.
bulk_index
(
filename
)
index
.
bulk_index
(
filename
)
index
.
print_index
(
)
print
(
index
)
def
query_index
(
index
,
args
):
def
query_index
(
index
:
Index
,
args
:
argparse
.
Namespace
):
query_terms
=
args
.
terms
query_terms
=
args
.
terms
# TODO use query terms to query index
# TODO use query terms to query index
def
clear_index
(
index
,
args
):
def
clear_index
(
index
:
Index
,
args
:
argparse
.
Namespace
):
index
.
clear
()
index
.
clear
()
def
main
():
def
main
():
parser
=
ArgumentParser
(
prog
=
'old_duck'
,
description
=
'OldDuck - A Python implementation of OldDog, using DuckDB'
)
parser
=
argparse
.
ArgumentParser
(
prog
=
'old_duck'
,
description
=
'OldDuck - A Python implementation of OldDog, using DuckDB'
)
subparsers
=
parser
.
add_subparsers
()
subparsers
=
parser
.
add_subparsers
()
...
...
poetry.lock
View file @
f5572c0e
...
@@ -10,6 +10,17 @@ version = "0.1.1"
...
@@ -10,6 +10,17 @@ version = "0.1.1"
numpy = ">=1.14"
numpy = ">=1.14"
pandas = ">=0.23"
pandas = ">=0.23"
[[package]]
category = "main"
description = "Natural Language Toolkit"
name = "nltk"
optional = false
python-versions = "*"
version = "3.4.5"
[package.dependencies]
six = "*"
[[package]]
[[package]]
category = "main"
category = "main"
description = "NumPy is the fundamental package for array computing with Python."
description = "NumPy is the fundamental package for array computing with Python."
...
@@ -59,11 +70,12 @@ python-versions = ">=2.6, !=3.0.*, !=3.1.*"
...
@@ -59,11 +70,12 @@ python-versions = ">=2.6, !=3.0.*, !=3.1.*"
version = "1.13.0"
version = "1.13.0"
[metadata]
[metadata]
content-hash = "
f1f0c6ddf09ca82467ae759f4c23a8485d3efaebe06d3a8a1f1e223ef9726a54
"
content-hash = "
7ef45e999a2464d5a8aeb70a370dbd0e2806b146e945e1ee679294a69d61c430
"
python-versions = "^3.6"
python-versions = "^3.6"
[metadata.hashes]
[metadata.hashes]
duckdb = ["2272cc1f8a6496b1e1c38dac09d5de054bb5ec668a8767166f7b0254f7d61275", "2273dd14dc8a5a97da07d088a61dafc4cb647b97366ef4b16fd6e25ebc8f049b", "357c6de793c419ac0890f9eb019b386a799407f7a34451e4318035526944ea68", "3d52673dcc53c37e00a6dd225725bc9117ae89f1c1c998438295e8086eab617e", "439211f22b106e242b33143e882c68b0a12af4b4d99e8a6f2c89719f189aab91", "43aee1ae234dd84e0ec632c352dc25cd0caa7b3950ffdfb7886f0ff9255c7e7d", "77d22b198a72d782281aa40156e737be566c007426de26433ea00edd67fb6c05", "88a659064b1135bb4b3fc0c23638ce2544458fea82967485f4f5dd9d1013f699", "93dc01f26980fc620e9d25184502fee351b462722e99cc910bb73e2e60b690e4", "d31eda4dddb9ba8b9d9bf83ead74d82c069d76617666d57c05c782cab5ddd59b", "ec9cecb93394617412b13cd782e8f3bf22d8be07d271fd78c49d233c33c5fb5a"]
duckdb = ["2272cc1f8a6496b1e1c38dac09d5de054bb5ec668a8767166f7b0254f7d61275", "2273dd14dc8a5a97da07d088a61dafc4cb647b97366ef4b16fd6e25ebc8f049b", "357c6de793c419ac0890f9eb019b386a799407f7a34451e4318035526944ea68", "3d52673dcc53c37e00a6dd225725bc9117ae89f1c1c998438295e8086eab617e", "439211f22b106e242b33143e882c68b0a12af4b4d99e8a6f2c89719f189aab91", "43aee1ae234dd84e0ec632c352dc25cd0caa7b3950ffdfb7886f0ff9255c7e7d", "77d22b198a72d782281aa40156e737be566c007426de26433ea00edd67fb6c05", "88a659064b1135bb4b3fc0c23638ce2544458fea82967485f4f5dd9d1013f699", "93dc01f26980fc620e9d25184502fee351b462722e99cc910bb73e2e60b690e4", "d31eda4dddb9ba8b9d9bf83ead74d82c069d76617666d57c05c782cab5ddd59b", "ec9cecb93394617412b13cd782e8f3bf22d8be07d271fd78c49d233c33c5fb5a"]
nltk = ["a08bdb4b8a1c13de16743068d9eb61c8c71c2e5d642e8e08205c528035843f82", "bed45551259aa2101381bbdd5df37d44ca2669c5c3dad72439fa459b29137d94"]
numpy = ["0b0dd8f47fb177d00fa6ef2d58783c4f41ad3126b139c91dd2f7c4b3fdf5e9a5", "25ffe71f96878e1da7e014467e19e7db90ae7d4e12affbc73101bcf61785214e", "26efd7f7d755e6ca966a5c0ac5a930a87dbbaab1c51716ac26a38f42ecc9bc4b", "28b1180c758abf34a5c3fea76fcee66a87def1656724c42bb14a6f9717a5bdf7", "2e418f0a59473dac424f888dd57e85f77502a593b207809211c76e5396ae4f5c", "30c84e3a62cfcb9e3066f25226e131451312a044f1fe2040e69ce792cb7de418", "4650d94bb9c947151737ee022b934b7d9a845a7c76e476f3e460f09a0c8c6f39", "4dd830a11e8724c9c9379feed1d1be43113f8bcce55f47ea7186d3946769ce26", "4f2a2b279efde194877aff1f76cf61c68e840db242a5c7169f1ff0fd59a2b1e2", "62d22566b3e3428dfc9ec972014c38ed9a4db4f8969c78f5414012ccd80a149e", "669795516d62f38845c7033679c648903200980d68935baaa17ac5c7ae03ae0c", "75fcd60d682db3e1f8fbe2b8b0c6761937ad56d01c1dc73edf4ef2748d5b6bc4", "9395b0a41e8b7e9a284e3be7060db9d14ad80273841c952c83a5afc241d2bd98", "9e37c35fc4e9410093b04a77d11a34c64bf658565e30df7cbe882056088a91c1", "a0678793096205a4d784bd99f32803ba8100f639cf3b932dc63b21621390ea7e", "b46554ad4dafb2927f88de5a1d207398c5385edbb5c84d30b3ef187c4a3894d8", "c867eeccd934920a800f65c6068acdd6b87e80d45cd8c8beefff783b23cdc462", "dd0667f5be56fb1b570154c2c0516a528e02d50da121bbbb2cbb0b6f87f59bc2", "de2b1c20494bdf47f0160bd88ed05f5e48ae5dc336b8de7cfade71abcc95c0b9", "f1df7b2b7740dd777571c732f98adb5aad5450aee32772f1b39249c8a50386f6", "ffca69e29079f7880c5392bf675eb8b4146479d976ae1924d01cd92b04cccbcc"]
numpy = ["0b0dd8f47fb177d00fa6ef2d58783c4f41ad3126b139c91dd2f7c4b3fdf5e9a5", "25ffe71f96878e1da7e014467e19e7db90ae7d4e12affbc73101bcf61785214e", "26efd7f7d755e6ca966a5c0ac5a930a87dbbaab1c51716ac26a38f42ecc9bc4b", "28b1180c758abf34a5c3fea76fcee66a87def1656724c42bb14a6f9717a5bdf7", "2e418f0a59473dac424f888dd57e85f77502a593b207809211c76e5396ae4f5c", "30c84e3a62cfcb9e3066f25226e131451312a044f1fe2040e69ce792cb7de418", "4650d94bb9c947151737ee022b934b7d9a845a7c76e476f3e460f09a0c8c6f39", "4dd830a11e8724c9c9379feed1d1be43113f8bcce55f47ea7186d3946769ce26", "4f2a2b279efde194877aff1f76cf61c68e840db242a5c7169f1ff0fd59a2b1e2", "62d22566b3e3428dfc9ec972014c38ed9a4db4f8969c78f5414012ccd80a149e", "669795516d62f38845c7033679c648903200980d68935baaa17ac5c7ae03ae0c", "75fcd60d682db3e1f8fbe2b8b0c6761937ad56d01c1dc73edf4ef2748d5b6bc4", "9395b0a41e8b7e9a284e3be7060db9d14ad80273841c952c83a5afc241d2bd98", "9e37c35fc4e9410093b04a77d11a34c64bf658565e30df7cbe882056088a91c1", "a0678793096205a4d784bd99f32803ba8100f639cf3b932dc63b21621390ea7e", "b46554ad4dafb2927f88de5a1d207398c5385edbb5c84d30b3ef187c4a3894d8", "c867eeccd934920a800f65c6068acdd6b87e80d45cd8c8beefff783b23cdc462", "dd0667f5be56fb1b570154c2c0516a528e02d50da121bbbb2cbb0b6f87f59bc2", "de2b1c20494bdf47f0160bd88ed05f5e48ae5dc336b8de7cfade71abcc95c0b9", "f1df7b2b7740dd777571c732f98adb5aad5450aee32772f1b39249c8a50386f6", "ffca69e29079f7880c5392bf675eb8b4146479d976ae1924d01cd92b04cccbcc"]
pandas = ["00dff3a8e337f5ed7ad295d98a31821d3d0fe7792da82d78d7fd79b89c03ea9d", "22361b1597c8c2ffd697aa9bf85423afa9e1fcfa6b1ea821054a244d5f24d75e", "255920e63850dc512ce356233081098554d641ba99c3767dde9e9f35630f994b", "26382aab9c119735908d94d2c5c08020a4a0a82969b7e5eefb92f902b3b30ad7", "33970f4cacdd9a0ddb8f21e151bfb9f178afb7c36eb7c25b9094c02876f385c2", "4545467a637e0e1393f7d05d61dace89689ad6d6f66f267f86fff737b702cce9", "52da74df8a9c9a103af0a72c9d5fdc8e0183a90884278db7f386b5692a2220a4", "61741f5aeb252f39c3031d11405305b6d10ce663c53bc3112705d7ad66c013d0", "6a3ac2c87e4e32a969921d1428525f09462770c349147aa8e9ab95f88c71ec71", "7458c48e3d15b8aaa7d575be60e1e4dd70348efcd9376656b72fecd55c59a4c3", "78bf638993219311377ce9836b3dc05f627a666d0dbc8cec37c0ff3c9ada673b", "8153705d6545fd9eb6dd2bc79301bff08825d2e2f716d5dced48daafc2d0b81f", "975c461accd14e89d71772e89108a050fa824c0b87a67d34cedf245f6681fc17", "9962957a27bfb70ab64103d0a7b42fa59c642fb4ed4cb75d0227b7bb9228535d", "adc3d3a3f9e59a38d923e90e20c4922fc62d1e5a03d083440468c6d8f3f1ae0a", "bbe3eb765a0b1e578833d243e2814b60c825b7fdbf4cdfe8e8aae8a08ed56ecf", "df8864824b1fe488cf778c3650ee59c3a0d8f42e53707de167ba6b4f7d35f133", "e45055c30a608076e31a9fcd780a956ed3b1fa20db61561b8d88b79259f526f7", "ee50c2142cdcf41995655d499a157d0a812fce55c97d9aad13bc1eef837ed36c"]
pandas = ["00dff3a8e337f5ed7ad295d98a31821d3d0fe7792da82d78d7fd79b89c03ea9d", "22361b1597c8c2ffd697aa9bf85423afa9e1fcfa6b1ea821054a244d5f24d75e", "255920e63850dc512ce356233081098554d641ba99c3767dde9e9f35630f994b", "26382aab9c119735908d94d2c5c08020a4a0a82969b7e5eefb92f902b3b30ad7", "33970f4cacdd9a0ddb8f21e151bfb9f178afb7c36eb7c25b9094c02876f385c2", "4545467a637e0e1393f7d05d61dace89689ad6d6f66f267f86fff737b702cce9", "52da74df8a9c9a103af0a72c9d5fdc8e0183a90884278db7f386b5692a2220a4", "61741f5aeb252f39c3031d11405305b6d10ce663c53bc3112705d7ad66c013d0", "6a3ac2c87e4e32a969921d1428525f09462770c349147aa8e9ab95f88c71ec71", "7458c48e3d15b8aaa7d575be60e1e4dd70348efcd9376656b72fecd55c59a4c3", "78bf638993219311377ce9836b3dc05f627a666d0dbc8cec37c0ff3c9ada673b", "8153705d6545fd9eb6dd2bc79301bff08825d2e2f716d5dced48daafc2d0b81f", "975c461accd14e89d71772e89108a050fa824c0b87a67d34cedf245f6681fc17", "9962957a27bfb70ab64103d0a7b42fa59c642fb4ed4cb75d0227b7bb9228535d", "adc3d3a3f9e59a38d923e90e20c4922fc62d1e5a03d083440468c6d8f3f1ae0a", "bbe3eb765a0b1e578833d243e2814b60c825b7fdbf4cdfe8e8aae8a08ed56ecf", "df8864824b1fe488cf778c3650ee59c3a0d8f42e53707de167ba6b4f7d35f133", "e45055c30a608076e31a9fcd780a956ed3b1fa20db61561b8d88b79259f526f7", "ee50c2142cdcf41995655d499a157d0a812fce55c97d9aad13bc1eef837ed36c"]
python-dateutil = ["73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", "75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"]
python-dateutil = ["73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", "75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"]
...
...
pyproject.toml
View file @
f5572c0e
...
@@ -7,6 +7,7 @@ authors = ["Your Name <you@example.com>"]
...
@@ -7,6 +7,7 @@ authors = ["Your Name <you@example.com>"]
[tool.poetry.dependencies]
[tool.poetry.dependencies]
python
=
"^3.6"
python
=
"^3.6"
duckdb
=
"^0.1.1"
duckdb
=
"^0.1.1"
nltk
=
"^3.4"
[tool.poetry.dev-dependencies]
[tool.poetry.dev-dependencies]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment