Commit 402b2c93 authored by Gijs Hendriksen

Initial commit

parents
Pipeline #33188 failed with stages
in 57 seconds
.idea/
*.db
*.db.wal
import duckdb
import os
class Index:
    """Builds a term dictionary in a DuckDB database file.

    The schema consists of three tables:

    * ``dict(termid, term, df)``    -- one row per distinct term,
    * ``docs(docid, name, length)`` -- created here but not yet populated,
    * ``terms(termid, docid, count)`` -- created here but not yet populated,

    plus the sequences ``term_ids`` and ``doc_ids`` used to hand out ids.
    """

    def __init__(self, db):
        """Create a fresh, empty index stored at path ``db``.

        Any database file already present at ``db`` is deleted, so every
        instance starts from an empty schema.
        """
        # Delete stale files *before* connecting. Removing the file after
        # duckdb.connect() would unlink the storage of the live connection.
        # The '<db>.wal' companion is removed as well so that a leftover
        # write-ahead log cannot be picked up by the new database
        # (presumably DuckDB's WAL naming -- confirm for the pinned version).
        for stale_path in (db, f'{db}.wal'):
            if os.path.exists(stale_path):
                os.remove(stale_path)

        self.cursor = duckdb.connect(db).cursor()
        self.stopwords = []

        self.create_table()

    # NOTE: unfinished draft for re-opening an existing database instead of
    # recreating it; kept for reference.
    #
    # def init_db(self):
    #     max_termid = self.cursor.execute('SELECT MAX(termid) + 1 FROM terms').fetchone()[0]
    #     max_termid = 1 if max_termid is None else max_termid[0]
    #
    #     max_docid = self.cursor.execute('SELECT MAX(docid) + 1 FROM docs').fetchone()[0]
    #     max_docid = 1 if max_docid is None else max_docid[0]
    #
    #     self.cursor.execute()
    #     self.cursor.execute(f'CREATE SEQUENCE term_ids START WITH {max_termid}')
    #     self.cursor.execute(f'CREATE SEQUENCE doc_ids START WITH ({max_docid})')

    def create_table(self):
        """Create the id sequences and the ``dict``, ``docs`` and ``terms`` tables."""
        self.cursor.execute('CREATE SEQUENCE term_ids')
        self.cursor.execute('CREATE SEQUENCE doc_ids')
        self.cursor.execute('CREATE TABLE dict('
                            'termid INTEGER NOT NULL,'
                            'term VARCHAR NOT NULL,'
                            'df INTEGER NOT NULL,'
                            'PRIMARY KEY (termid))')
        self.cursor.execute('CREATE TABLE docs('
                            'docid INTEGER NOT NULL,'
                            'name VARCHAR NOT NULL,'
                            'length INTEGER NOT NULL,'
                            'PRIMARY KEY (docid))')
        self.cursor.execute('CREATE TABLE terms('
                            'termid INTEGER NOT NULL,'
                            'docid INTEGER NOT NULL,'
                            'count INTEGER NOT NULL)')

    def get_terms(self, body):
        """Return the lower-cased, whitespace-separated terms of ``body``.

        Terms whose lower-cased form appears in ``self.stopwords`` are
        dropped. Order and duplicates are preserved.
        """
        lowered = (term.lower() for term in body.split())
        return [term for term in lowered if term not in self.stopwords]

    def index(self, document):
        """Add the terms of ``document['body']`` to the ``dict`` table.

        A term seen for the first time is inserted with ``df = 1``; a term
        that already exists has its ``df`` incremented. Only the ``dict``
        table is touched; ``docs`` and ``terms`` are not written here.

        NOTE(review): ``df`` is incremented for every occurrence, including
        repeats within the same document. If ``df`` is meant to be a document
        frequency this over-counts -- confirm the intended semantics.
        """
        terms = self.get_terms(document['body'])

        for term in terms:
            # Bind the term as a parameter instead of interpolating it into
            # the SQL text, so terms containing quotes cannot break or alter
            # the statement.
            term_id = self.cursor.execute(
                'SELECT termid FROM dict WHERE term = ?', [term]).fetchone()
            print(term_id)  # debug output

            if term_id is None:
                self.cursor.execute(
                    "INSERT INTO dict VALUES (nextval('term_ids'), ?, 1)", [term])
            else:
                self.cursor.execute(
                    'UPDATE dict SET df = df + 1 WHERE termid = ?', [term_id[0]])

        print(self.cursor.execute('SELECT * FROM dict').fetchall())  # debug output
# Smoke run: build a fresh index in 'docs.db' and feed it a single sample
# document. The repeated "I" makes the same term appear twice in one body.
index = Index('docs.db')
index.index(dict(
    body='I I put on my robe and wizard hat',
    name='doc1',
))
[[package]]
category = "main"
description = "DuckDB embedded database"
name = "duckdb"
optional = false
python-versions = "*"
version = "0.1.1"
[package.dependencies]
numpy = ">=1.14"
pandas = ">=0.23"
[[package]]
category = "main"
description = "NumPy is the fundamental package for array computing with Python."
name = "numpy"
optional = false
python-versions = ">=3.5"
version = "1.17.3"
[[package]]
category = "main"
description = "Powerful data structures for data analysis, time series, and statistics"
name = "pandas"
optional = false
python-versions = ">=3.5.3"
version = "0.25.3"
[package.dependencies]
numpy = ">=1.13.3"
python-dateutil = ">=2.6.1"
pytz = ">=2017.2"
[[package]]
category = "main"
description = "Extensions to the standard Python datetime module"
name = "python-dateutil"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
version = "2.8.1"
[package.dependencies]
six = ">=1.5"
[[package]]
category = "main"
description = "World timezone definitions, modern and historical"
name = "pytz"
optional = false
python-versions = "*"
version = "2019.3"
[[package]]
category = "main"
description = "Python 2 and 3 compatibility utilities"
name = "six"
optional = false
python-versions = ">=2.6, !=3.0.*, !=3.1.*"
version = "1.13.0"
[metadata]
content-hash = "f1f0c6ddf09ca82467ae759f4c23a8485d3efaebe06d3a8a1f1e223ef9726a54"
python-versions = "^3.6"
[metadata.hashes]
duckdb = ["2272cc1f8a6496b1e1c38dac09d5de054bb5ec668a8767166f7b0254f7d61275", "2273dd14dc8a5a97da07d088a61dafc4cb647b97366ef4b16fd6e25ebc8f049b", "357c6de793c419ac0890f9eb019b386a799407f7a34451e4318035526944ea68", "3d52673dcc53c37e00a6dd225725bc9117ae89f1c1c998438295e8086eab617e", "439211f22b106e242b33143e882c68b0a12af4b4d99e8a6f2c89719f189aab91", "43aee1ae234dd84e0ec632c352dc25cd0caa7b3950ffdfb7886f0ff9255c7e7d", "77d22b198a72d782281aa40156e737be566c007426de26433ea00edd67fb6c05", "88a659064b1135bb4b3fc0c23638ce2544458fea82967485f4f5dd9d1013f699", "93dc01f26980fc620e9d25184502fee351b462722e99cc910bb73e2e60b690e4", "d31eda4dddb9ba8b9d9bf83ead74d82c069d76617666d57c05c782cab5ddd59b", "ec9cecb93394617412b13cd782e8f3bf22d8be07d271fd78c49d233c33c5fb5a"]
numpy = ["0b0dd8f47fb177d00fa6ef2d58783c4f41ad3126b139c91dd2f7c4b3fdf5e9a5", "25ffe71f96878e1da7e014467e19e7db90ae7d4e12affbc73101bcf61785214e", "26efd7f7d755e6ca966a5c0ac5a930a87dbbaab1c51716ac26a38f42ecc9bc4b", "28b1180c758abf34a5c3fea76fcee66a87def1656724c42bb14a6f9717a5bdf7", "2e418f0a59473dac424f888dd57e85f77502a593b207809211c76e5396ae4f5c", "30c84e3a62cfcb9e3066f25226e131451312a044f1fe2040e69ce792cb7de418", "4650d94bb9c947151737ee022b934b7d9a845a7c76e476f3e460f09a0c8c6f39", "4dd830a11e8724c9c9379feed1d1be43113f8bcce55f47ea7186d3946769ce26", "4f2a2b279efde194877aff1f76cf61c68e840db242a5c7169f1ff0fd59a2b1e2", "62d22566b3e3428dfc9ec972014c38ed9a4db4f8969c78f5414012ccd80a149e", "669795516d62f38845c7033679c648903200980d68935baaa17ac5c7ae03ae0c", "75fcd60d682db3e1f8fbe2b8b0c6761937ad56d01c1dc73edf4ef2748d5b6bc4", "9395b0a41e8b7e9a284e3be7060db9d14ad80273841c952c83a5afc241d2bd98", "9e37c35fc4e9410093b04a77d11a34c64bf658565e30df7cbe882056088a91c1", "a0678793096205a4d784bd99f32803ba8100f639cf3b932dc63b21621390ea7e", "b46554ad4dafb2927f88de5a1d207398c5385edbb5c84d30b3ef187c4a3894d8", "c867eeccd934920a800f65c6068acdd6b87e80d45cd8c8beefff783b23cdc462", "dd0667f5be56fb1b570154c2c0516a528e02d50da121bbbb2cbb0b6f87f59bc2", "de2b1c20494bdf47f0160bd88ed05f5e48ae5dc336b8de7cfade71abcc95c0b9", "f1df7b2b7740dd777571c732f98adb5aad5450aee32772f1b39249c8a50386f6", "ffca69e29079f7880c5392bf675eb8b4146479d976ae1924d01cd92b04cccbcc"]
pandas = ["00dff3a8e337f5ed7ad295d98a31821d3d0fe7792da82d78d7fd79b89c03ea9d", "22361b1597c8c2ffd697aa9bf85423afa9e1fcfa6b1ea821054a244d5f24d75e", "255920e63850dc512ce356233081098554d641ba99c3767dde9e9f35630f994b", "26382aab9c119735908d94d2c5c08020a4a0a82969b7e5eefb92f902b3b30ad7", "33970f4cacdd9a0ddb8f21e151bfb9f178afb7c36eb7c25b9094c02876f385c2", "4545467a637e0e1393f7d05d61dace89689ad6d6f66f267f86fff737b702cce9", "52da74df8a9c9a103af0a72c9d5fdc8e0183a90884278db7f386b5692a2220a4", "61741f5aeb252f39c3031d11405305b6d10ce663c53bc3112705d7ad66c013d0", "6a3ac2c87e4e32a969921d1428525f09462770c349147aa8e9ab95f88c71ec71", "7458c48e3d15b8aaa7d575be60e1e4dd70348efcd9376656b72fecd55c59a4c3", "78bf638993219311377ce9836b3dc05f627a666d0dbc8cec37c0ff3c9ada673b", "8153705d6545fd9eb6dd2bc79301bff08825d2e2f716d5dced48daafc2d0b81f", "975c461accd14e89d71772e89108a050fa824c0b87a67d34cedf245f6681fc17", "9962957a27bfb70ab64103d0a7b42fa59c642fb4ed4cb75d0227b7bb9228535d", "adc3d3a3f9e59a38d923e90e20c4922fc62d1e5a03d083440468c6d8f3f1ae0a", "bbe3eb765a0b1e578833d243e2814b60c825b7fdbf4cdfe8e8aae8a08ed56ecf", "df8864824b1fe488cf778c3650ee59c3a0d8f42e53707de167ba6b4f7d35f133", "e45055c30a608076e31a9fcd780a956ed3b1fa20db61561b8d88b79259f526f7", "ee50c2142cdcf41995655d499a157d0a812fce55c97d9aad13bc1eef837ed36c"]
python-dateutil = ["73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", "75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"]
pytz = ["1c557d7d0e871de1f5ccd5833f60fb2550652da6be2693c1e02300743d21500d", "b02c06db6cf09c12dd25137e563b31700d3b80fcc4ad23abb7a315f2789819be"]
six = ["1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", "30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"]
[tool.poetry]
name = "OldDuck"
version = "0.1.0"
description = "Teaching old ducks new tricks; a python implementation of OldDog, a column store based IR system, using DuckDB"
authors = ["Your Name <you@example.com>"]
[tool.poetry.dependencies]
python = "^3.6"
duckdb = "^0.1.1"
[tool.poetry.dev-dependencies]
[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment