Commit f5572c0e authored by Gijs Hendriksen

Type hinting and better output formatting

parent df3bb8ca
......@@ -2,14 +2,19 @@ import duckdb
import json
import os
from collections import defaultdict
from nltk import corpus
from typing import List
class Index:
def __init__(self, db):
cursor: duckdb.Cursor
stopwords: List[str]
def __init__(self, db: str, stopwords: List[str] = None):
db_exists = os.path.exists(db)
self.cursor = duckdb.connect(db).cursor()
self.stopwords = ['i', 'on', 'my', 'and', 'in', 'the']
self.stopwords = stopwords or corpus.stopwords.words('english')
if db_exists:
self.reset_auto_increment()
......@@ -44,7 +49,7 @@ class Index:
'docid INTEGER NOT NULL,'
'count INTEGER NOT NULL)')
def get_terms(self, body):
def get_terms(self, body: str):
terms = defaultdict(int)
for term in body.lower().split():
......@@ -53,7 +58,7 @@ class Index:
return terms
def index(self, document):
def index(self, document: dict):
terms = self.get_terms(document['body'])
doc_name = document['name']
doc_length = len(terms)
......@@ -73,9 +78,13 @@ class Index:
self.cursor.execute(f"INSERT INTO terms VALUES ({term_id}, {doc_id}, {frequency})")
def bulk_index(self, filename: str):
    """Index every document stored in the JSON file at ``filename``.

    The file is parsed with ``json.load`` and each element of the decoded
    value is passed to ``self.index`` in order.

    If the file cannot be opened, cannot be decoded as text, or does not
    contain valid JSON, a warning is printed and nothing is indexed.

    Args:
        filename: Path of the JSON file to read.
    """
    try:
        with open(filename) as _file:
            data = json.load(_file)
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError, which subclass it.
        print('[!] Invalid input file!')
        return

    for document in data:
        self.index(document)
......@@ -85,18 +94,21 @@ class Index:
self.cursor.execute("DELETE FROM docs")
self.cursor.execute("DELETE FROM dict")
def print_index(self):
    """Print the contents of the ``dict``, ``terms`` and ``docs`` tables.

    For each table, the table name is printed on its own line, followed by
    one line per row with the column values separated by tabs.
    """
    # Tables are dumped in a fixed order: dict, then terms, then docs.
    for table in ('dict', 'terms', 'docs'):
        print(table)
        rows = self.cursor.execute(f'SELECT * FROM {table}').fetchall()
        for row in rows:
            print('\t'.join(map(str, row)))
def __str__(self):
    """Return a printable dump of the ``dict``, ``terms`` and ``docs`` tables.

    Each table is rendered as its name, a rule of 20 dashes and the string
    form of the value returned by ``fetchdf()`` for ``SELECT * FROM <table>``.
    Consecutive tables are separated by one blank line.
    """
    sections = []
    for table in ('dict', 'terms', 'docs'):
        frame = self.cursor.execute(f'SELECT * FROM {table}').fetchdf()
        sections.append('\n'.join([table, '-' * 20, str(frame)]))

    # Joining with a blank line reproduces the '' separator entries of the
    # original flat list.
    return '\n\n'.join(sections)
from argparse import ArgumentParser
import argparse
from index import Index
def bulk_index(index: "Index", args: argparse.Namespace):
    """Load the file named by ``args.data`` into ``index``, then print it.

    Args:
        index: Index the documents are added to.
        args: Parsed command-line arguments; ``args.data`` is passed to
            ``index.bulk_index`` as the file name.
    """
    filename = args.data
    index.bulk_index(filename)
    # print() renders the index through its __str__ method.
    print(index)
def query_index(index: "Index", args: argparse.Namespace):
    """Placeholder for querying the index; no query is executed yet.

    Args:
        index: Index to query (currently unused).
        args: Parsed command-line arguments; ``args.terms`` holds the
            query terms.
    """
    query_terms = args.terms
    # TODO use query terms to query index
def clear_index(index: "Index", args: argparse.Namespace):
    """Call ``index.clear()``.

    Args:
        index: Index to clear.
        args: Parsed command-line arguments (unused).
    """
    index.clear()
def main():
parser = ArgumentParser(prog='old_duck', description='OldDuck - A Python implementation of OldDog, using DuckDB')
parser = argparse.ArgumentParser(prog='old_duck',
description='OldDuck - A Python implementation of OldDog, using DuckDB')
subparsers = parser.add_subparsers()
......
......@@ -10,6 +10,17 @@ version = "0.1.1"
numpy = ">=1.14"
pandas = ">=0.23"
[[package]]
category = "main"
description = "Natural Language Toolkit"
name = "nltk"
optional = false
python-versions = "*"
version = "3.4.5"
[package.dependencies]
six = "*"
[[package]]
category = "main"
description = "NumPy is the fundamental package for array computing with Python."
......@@ -59,11 +70,12 @@ python-versions = ">=2.6, !=3.0.*, !=3.1.*"
version = "1.13.0"
[metadata]
content-hash = "f1f0c6ddf09ca82467ae759f4c23a8485d3efaebe06d3a8a1f1e223ef9726a54"
content-hash = "7ef45e999a2464d5a8aeb70a370dbd0e2806b146e945e1ee679294a69d61c430"
python-versions = "^3.6"
[metadata.hashes]
duckdb = ["2272cc1f8a6496b1e1c38dac09d5de054bb5ec668a8767166f7b0254f7d61275", "2273dd14dc8a5a97da07d088a61dafc4cb647b97366ef4b16fd6e25ebc8f049b", "357c6de793c419ac0890f9eb019b386a799407f7a34451e4318035526944ea68", "3d52673dcc53c37e00a6dd225725bc9117ae89f1c1c998438295e8086eab617e", "439211f22b106e242b33143e882c68b0a12af4b4d99e8a6f2c89719f189aab91", "43aee1ae234dd84e0ec632c352dc25cd0caa7b3950ffdfb7886f0ff9255c7e7d", "77d22b198a72d782281aa40156e737be566c007426de26433ea00edd67fb6c05", "88a659064b1135bb4b3fc0c23638ce2544458fea82967485f4f5dd9d1013f699", "93dc01f26980fc620e9d25184502fee351b462722e99cc910bb73e2e60b690e4", "d31eda4dddb9ba8b9d9bf83ead74d82c069d76617666d57c05c782cab5ddd59b", "ec9cecb93394617412b13cd782e8f3bf22d8be07d271fd78c49d233c33c5fb5a"]
nltk = ["a08bdb4b8a1c13de16743068d9eb61c8c71c2e5d642e8e08205c528035843f82", "bed45551259aa2101381bbdd5df37d44ca2669c5c3dad72439fa459b29137d94"]
numpy = ["0b0dd8f47fb177d00fa6ef2d58783c4f41ad3126b139c91dd2f7c4b3fdf5e9a5", "25ffe71f96878e1da7e014467e19e7db90ae7d4e12affbc73101bcf61785214e", "26efd7f7d755e6ca966a5c0ac5a930a87dbbaab1c51716ac26a38f42ecc9bc4b", "28b1180c758abf34a5c3fea76fcee66a87def1656724c42bb14a6f9717a5bdf7", "2e418f0a59473dac424f888dd57e85f77502a593b207809211c76e5396ae4f5c", "30c84e3a62cfcb9e3066f25226e131451312a044f1fe2040e69ce792cb7de418", "4650d94bb9c947151737ee022b934b7d9a845a7c76e476f3e460f09a0c8c6f39", "4dd830a11e8724c9c9379feed1d1be43113f8bcce55f47ea7186d3946769ce26", "4f2a2b279efde194877aff1f76cf61c68e840db242a5c7169f1ff0fd59a2b1e2", "62d22566b3e3428dfc9ec972014c38ed9a4db4f8969c78f5414012ccd80a149e", "669795516d62f38845c7033679c648903200980d68935baaa17ac5c7ae03ae0c", "75fcd60d682db3e1f8fbe2b8b0c6761937ad56d01c1dc73edf4ef2748d5b6bc4", "9395b0a41e8b7e9a284e3be7060db9d14ad80273841c952c83a5afc241d2bd98", "9e37c35fc4e9410093b04a77d11a34c64bf658565e30df7cbe882056088a91c1", "a0678793096205a4d784bd99f32803ba8100f639cf3b932dc63b21621390ea7e", "b46554ad4dafb2927f88de5a1d207398c5385edbb5c84d30b3ef187c4a3894d8", "c867eeccd934920a800f65c6068acdd6b87e80d45cd8c8beefff783b23cdc462", "dd0667f5be56fb1b570154c2c0516a528e02d50da121bbbb2cbb0b6f87f59bc2", "de2b1c20494bdf47f0160bd88ed05f5e48ae5dc336b8de7cfade71abcc95c0b9", "f1df7b2b7740dd777571c732f98adb5aad5450aee32772f1b39249c8a50386f6", "ffca69e29079f7880c5392bf675eb8b4146479d976ae1924d01cd92b04cccbcc"]
pandas = ["00dff3a8e337f5ed7ad295d98a31821d3d0fe7792da82d78d7fd79b89c03ea9d", "22361b1597c8c2ffd697aa9bf85423afa9e1fcfa6b1ea821054a244d5f24d75e", "255920e63850dc512ce356233081098554d641ba99c3767dde9e9f35630f994b", "26382aab9c119735908d94d2c5c08020a4a0a82969b7e5eefb92f902b3b30ad7", "33970f4cacdd9a0ddb8f21e151bfb9f178afb7c36eb7c25b9094c02876f385c2", "4545467a637e0e1393f7d05d61dace89689ad6d6f66f267f86fff737b702cce9", "52da74df8a9c9a103af0a72c9d5fdc8e0183a90884278db7f386b5692a2220a4", "61741f5aeb252f39c3031d11405305b6d10ce663c53bc3112705d7ad66c013d0", "6a3ac2c87e4e32a969921d1428525f09462770c349147aa8e9ab95f88c71ec71", "7458c48e3d15b8aaa7d575be60e1e4dd70348efcd9376656b72fecd55c59a4c3", "78bf638993219311377ce9836b3dc05f627a666d0dbc8cec37c0ff3c9ada673b", "8153705d6545fd9eb6dd2bc79301bff08825d2e2f716d5dced48daafc2d0b81f", "975c461accd14e89d71772e89108a050fa824c0b87a67d34cedf245f6681fc17", "9962957a27bfb70ab64103d0a7b42fa59c642fb4ed4cb75d0227b7bb9228535d", "adc3d3a3f9e59a38d923e90e20c4922fc62d1e5a03d083440468c6d8f3f1ae0a", "bbe3eb765a0b1e578833d243e2814b60c825b7fdbf4cdfe8e8aae8a08ed56ecf", "df8864824b1fe488cf778c3650ee59c3a0d8f42e53707de167ba6b4f7d35f133", "e45055c30a608076e31a9fcd780a956ed3b1fa20db61561b8d88b79259f526f7", "ee50c2142cdcf41995655d499a157d0a812fce55c97d9aad13bc1eef837ed36c"]
python-dateutil = ["73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", "75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"]
......
......@@ -7,6 +7,7 @@ authors = ["Your Name <you@example.com>"]
[tool.poetry.dependencies]
python = "^3.6"
duckdb = "^0.1.1"
nltk = "^3.4"
[tool.poetry.dev-dependencies]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment