Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
Gijs Hendriksen
Information Retrieval
Commits
402b2c93
Commit
402b2c93
authored
Nov 13, 2019
by
Gijs Hendriksen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Initial commit
parents
Pipeline
#33188
failed with stages
in 57 seconds
Changes
4
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
156 additions
and
0 deletions
+156
-0
.gitignore
.gitignore
+3
-0
index.py
index.py
+67
-0
poetry.lock
poetry.lock
+71
-0
pyproject.toml
pyproject.toml
+15
-0
No files found.
.gitignore
0 → 100644
View file @
402b2c93
.idea/
*.db
*.db.wal
index.py
0 → 100644
View file @
402b2c93
import
duckdb
import
os
class Index:
    """Build a small inverted-index dictionary inside a DuckDB database.

    The schema consists of three tables: ``dict`` (term id, term, document
    frequency), ``docs`` (document id, name, length) and ``terms`` (term id,
    document id, count), plus the sequences ``term_ids`` and ``doc_ids``.
    Only ``dict`` is populated by :meth:`index` at the moment; ``docs`` and
    ``terms`` are created but not yet written to.
    """

    # TODO: support reopening an existing database by starting the
    # ``term_ids`` / ``doc_ids`` sequences at MAX(termid) + 1 / MAX(docid) + 1
    # instead of always rebuilding from scratch.

    def __init__(self, db):
        """Create a fresh index stored in the DuckDB database file *db*.

        Any existing file at *db* is deleted, so every instance starts with
        an empty schema.

        :param db: path of the database file handed to ``duckdb.connect``.
        """
        # Delete stale files *before* connecting. Connecting first would load
        # the old schema (making the CREATE statements below fail) and would
        # leave the connection pointing at an unlinked file; on Windows the
        # open file could not be removed at all. The ``.wal`` companion is
        # removed as well so no old log is left next to the new database
        # (the project's .gitignore lists ``*.db.wal``).
        for stale in (db, f'{db}.wal'):
            if os.path.exists(stale):
                os.remove(stale)

        self.cursor = duckdb.connect(db).cursor()
        self.stopwords = []
        self.create_table()

    @staticmethod
    def _quote(value):
        """Return *value* as a single-quoted SQL string literal.

        Embedded single quotes are doubled, so a term such as ``it's`` cannot
        terminate the literal early or inject additional SQL.
        """
        return "'" + value.replace("'", "''") + "'"

    def create_table(self):
        """Create the sequences and the ``dict``, ``docs`` and ``terms`` tables."""
        statements = (
            'CREATE SEQUENCE term_ids',
            'CREATE SEQUENCE doc_ids',
            'CREATE TABLE dict('
            'termid INTEGER NOT NULL,'
            'term VARCHAR NOT NULL,'
            'df INTEGER NOT NULL,'
            'PRIMARY KEY (termid))',
            'CREATE TABLE docs('
            'docid INTEGER NOT NULL,'
            'name VARCHAR NOT NULL,'
            'length INTEGER NOT NULL,'
            'PRIMARY KEY (docid))',
            'CREATE TABLE terms('
            'termid INTEGER NOT NULL,'
            'docid INTEGER NOT NULL,'
            'count INTEGER NOT NULL)',
        )
        for statement in statements:
            self.cursor.execute(statement)

    def get_terms(self, body):
        """Split *body* on whitespace and return its lower-cased terms.

        Terms whose lower-cased form appears in ``self.stopwords`` are
        dropped. Duplicates are kept and the original order is preserved.

        :param body: raw document text.
        :return: list of lower-cased terms.
        """
        lowered = (token.lower() for token in body.split())
        return [term for term in lowered if term not in self.stopwords]

    def index(self, document):
        """Add the terms of *document* to the ``dict`` table.

        A term not seen before is inserted with ``df = 1``; a known term has
        its ``df`` incremented. ``df`` is a document frequency, so each
        distinct term is counted once per document regardless of how often
        it occurs in the body.

        :param document: mapping with at least a ``'body'`` key holding the
            document text.
        """
        seen = set()
        for term in self.get_terms(document['body']):
            # Count every distinct term only once for this document.
            if term in seen:
                continue
            seen.add(term)

            # Terms come straight from document text, so they are escaped
            # before being embedded in SQL.
            literal = self._quote(term)
            term_id = self.cursor.execute(
                f"SELECT termid FROM dict WHERE term = {literal}"
            ).fetchone()
            print(term_id)  # debug trace kept from the original

            if term_id is None:
                self.cursor.execute(
                    f"INSERT INTO dict VALUES (nextval('term_ids'), {literal}, 1)"
                )
            else:
                self.cursor.execute(
                    f"UPDATE dict SET df = df + 1 WHERE termid = {term_id[0]}"
                )

        # Dump the dictionary table so the result of the run is visible.
        print(self.cursor.execute('SELECT * FROM dict').fetchall())
# Smoke run: index a single sample document into docs.db.
index = Index('docs.db')
index.index(dict(body='I I put on my robe and wizard hat', name='doc1'))
poetry.lock
0 → 100644
View file @
402b2c93
[[package]]
category = "main"
description = "DuckDB embedded database"
name = "duckdb"
optional = false
python-versions = "*"
version = "0.1.1"
[package.dependencies]
numpy = ">=1.14"
pandas = ">=0.23"
[[package]]
category = "main"
description = "NumPy is the fundamental package for array computing with Python."
name = "numpy"
optional = false
python-versions = ">=3.5"
version = "1.17.3"
[[package]]
category = "main"
description = "Powerful data structures for data analysis, time series, and statistics"
name = "pandas"
optional = false
python-versions = ">=3.5.3"
version = "0.25.3"
[package.dependencies]
numpy = ">=1.13.3"
python-dateutil = ">=2.6.1"
pytz = ">=2017.2"
[[package]]
category = "main"
description = "Extensions to the standard Python datetime module"
name = "python-dateutil"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
version = "2.8.1"
[package.dependencies]
six = ">=1.5"
[[package]]
category = "main"
description = "World timezone definitions, modern and historical"
name = "pytz"
optional = false
python-versions = "*"
version = "2019.3"
[[package]]
category = "main"
description = "Python 2 and 3 compatibility utilities"
name = "six"
optional = false
python-versions = ">=2.6, !=3.0.*, !=3.1.*"
version = "1.13.0"
[metadata]
content-hash = "f1f0c6ddf09ca82467ae759f4c23a8485d3efaebe06d3a8a1f1e223ef9726a54"
python-versions = "^3.6"
[metadata.hashes]
duckdb = ["2272cc1f8a6496b1e1c38dac09d5de054bb5ec668a8767166f7b0254f7d61275", "2273dd14dc8a5a97da07d088a61dafc4cb647b97366ef4b16fd6e25ebc8f049b", "357c6de793c419ac0890f9eb019b386a799407f7a34451e4318035526944ea68", "3d52673dcc53c37e00a6dd225725bc9117ae89f1c1c998438295e8086eab617e", "439211f22b106e242b33143e882c68b0a12af4b4d99e8a6f2c89719f189aab91", "43aee1ae234dd84e0ec632c352dc25cd0caa7b3950ffdfb7886f0ff9255c7e7d", "77d22b198a72d782281aa40156e737be566c007426de26433ea00edd67fb6c05", "88a659064b1135bb4b3fc0c23638ce2544458fea82967485f4f5dd9d1013f699", "93dc01f26980fc620e9d25184502fee351b462722e99cc910bb73e2e60b690e4", "d31eda4dddb9ba8b9d9bf83ead74d82c069d76617666d57c05c782cab5ddd59b", "ec9cecb93394617412b13cd782e8f3bf22d8be07d271fd78c49d233c33c5fb5a"]
numpy = ["0b0dd8f47fb177d00fa6ef2d58783c4f41ad3126b139c91dd2f7c4b3fdf5e9a5", "25ffe71f96878e1da7e014467e19e7db90ae7d4e12affbc73101bcf61785214e", "26efd7f7d755e6ca966a5c0ac5a930a87dbbaab1c51716ac26a38f42ecc9bc4b", "28b1180c758abf34a5c3fea76fcee66a87def1656724c42bb14a6f9717a5bdf7", "2e418f0a59473dac424f888dd57e85f77502a593b207809211c76e5396ae4f5c", "30c84e3a62cfcb9e3066f25226e131451312a044f1fe2040e69ce792cb7de418", "4650d94bb9c947151737ee022b934b7d9a845a7c76e476f3e460f09a0c8c6f39", "4dd830a11e8724c9c9379feed1d1be43113f8bcce55f47ea7186d3946769ce26", "4f2a2b279efde194877aff1f76cf61c68e840db242a5c7169f1ff0fd59a2b1e2", "62d22566b3e3428dfc9ec972014c38ed9a4db4f8969c78f5414012ccd80a149e", "669795516d62f38845c7033679c648903200980d68935baaa17ac5c7ae03ae0c", "75fcd60d682db3e1f8fbe2b8b0c6761937ad56d01c1dc73edf4ef2748d5b6bc4", "9395b0a41e8b7e9a284e3be7060db9d14ad80273841c952c83a5afc241d2bd98", "9e37c35fc4e9410093b04a77d11a34c64bf658565e30df7cbe882056088a91c1", "a0678793096205a4d784bd99f32803ba8100f639cf3b932dc63b21621390ea7e", "b46554ad4dafb2927f88de5a1d207398c5385edbb5c84d30b3ef187c4a3894d8", "c867eeccd934920a800f65c6068acdd6b87e80d45cd8c8beefff783b23cdc462", "dd0667f5be56fb1b570154c2c0516a528e02d50da121bbbb2cbb0b6f87f59bc2", "de2b1c20494bdf47f0160bd88ed05f5e48ae5dc336b8de7cfade71abcc95c0b9", "f1df7b2b7740dd777571c732f98adb5aad5450aee32772f1b39249c8a50386f6", "ffca69e29079f7880c5392bf675eb8b4146479d976ae1924d01cd92b04cccbcc"]
pandas = ["00dff3a8e337f5ed7ad295d98a31821d3d0fe7792da82d78d7fd79b89c03ea9d", "22361b1597c8c2ffd697aa9bf85423afa9e1fcfa6b1ea821054a244d5f24d75e", "255920e63850dc512ce356233081098554d641ba99c3767dde9e9f35630f994b", "26382aab9c119735908d94d2c5c08020a4a0a82969b7e5eefb92f902b3b30ad7", "33970f4cacdd9a0ddb8f21e151bfb9f178afb7c36eb7c25b9094c02876f385c2", "4545467a637e0e1393f7d05d61dace89689ad6d6f66f267f86fff737b702cce9", "52da74df8a9c9a103af0a72c9d5fdc8e0183a90884278db7f386b5692a2220a4", "61741f5aeb252f39c3031d11405305b6d10ce663c53bc3112705d7ad66c013d0", "6a3ac2c87e4e32a969921d1428525f09462770c349147aa8e9ab95f88c71ec71", "7458c48e3d15b8aaa7d575be60e1e4dd70348efcd9376656b72fecd55c59a4c3", "78bf638993219311377ce9836b3dc05f627a666d0dbc8cec37c0ff3c9ada673b", "8153705d6545fd9eb6dd2bc79301bff08825d2e2f716d5dced48daafc2d0b81f", "975c461accd14e89d71772e89108a050fa824c0b87a67d34cedf245f6681fc17", "9962957a27bfb70ab64103d0a7b42fa59c642fb4ed4cb75d0227b7bb9228535d", "adc3d3a3f9e59a38d923e90e20c4922fc62d1e5a03d083440468c6d8f3f1ae0a", "bbe3eb765a0b1e578833d243e2814b60c825b7fdbf4cdfe8e8aae8a08ed56ecf", "df8864824b1fe488cf778c3650ee59c3a0d8f42e53707de167ba6b4f7d35f133", "e45055c30a608076e31a9fcd780a956ed3b1fa20db61561b8d88b79259f526f7", "ee50c2142cdcf41995655d499a157d0a812fce55c97d9aad13bc1eef837ed36c"]
python-dateutil = ["73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", "75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"]
pytz = ["1c557d7d0e871de1f5ccd5833f60fb2550652da6be2693c1e02300743d21500d", "b02c06db6cf09c12dd25137e563b31700d3b80fcc4ad23abb7a315f2789819be"]
six = ["1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", "30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"]
pyproject.toml
0 → 100644
View file @
402b2c93
[tool.poetry]
name
=
"OldDuck"
version
=
"0.1.0"
description
=
"Teaching old ducks new tricks; a python implementation of OldDog, a column store based IR system, using DuckDB"
authors
=
[
"Your Name <you@example.com>"
]
[tool.poetry.dependencies]
python
=
"^3.6"
duckdb
=
"^0.1.1"
[tool.poetry.dev-dependencies]
[build-system]
requires
=
["poetry>=0.12"]
build-backend
=
"poetry.masonry.api"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment