diff --git a/index.py b/index.py index 696075e790e9f573c37a8468cb92b8fceeb28b7c..bda417898fd91d1a84426651631c37731a753bcb 100644 --- a/index.py +++ b/index.py @@ -2,14 +2,19 @@ import duckdb import json import os from collections import defaultdict +from nltk import corpus +from typing import List class Index: - def __init__(self, db): + cursor: duckdb.Cursor + stopwords: List[str] + + def __init__(self, db: str, stopwords: List[str] = None): db_exists = os.path.exists(db) self.cursor = duckdb.connect(db).cursor() - self.stopwords = ['i', 'on', 'my', 'and', 'in', 'the'] + self.stopwords = stopwords or corpus.stopwords.words('english') if db_exists: self.reset_auto_increment() @@ -44,7 +49,7 @@ class Index: 'docid INTEGER NOT NULL,' 'count INTEGER NOT NULL)') - def get_terms(self, body): + def get_terms(self, body: str): terms = defaultdict(int) for term in body.lower().split(): @@ -53,7 +58,7 @@ class Index: return terms - def index(self, document): + def index(self, document: dict): terms = self.get_terms(document['body']) doc_name = document['name'] doc_length = len(terms) @@ -73,9 +78,13 @@ class Index: self.cursor.execute(f"INSERT INTO terms VALUES ({term_id}, {doc_id}, {frequency})") - def bulk_index(self, filename): - with open(filename) as _file: - data = json.load(_file) + def bulk_index(self, filename: str): + try: + with open(filename) as _file: + data = json.load(_file) + except json.JSONDecodeError: + print('[!] Invalid input file!') + return for document in data: self.index(document) @@ -85,18 +94,21 @@ class Index: self.cursor.execute("DELETE FROM docs") self.cursor.execute("DELETE FROM dict") - def print_index(self): - print('dict') - dict_rows = self.cursor.execute('SELECT * FROM dict').fetchall() - for row in dict_rows: - print('\t'.join(map(str, row))) - - print('terms') - term_rows = self.cursor.execute('SELECT * FROM terms').fetchall() - for row in term_rows: - print('\t'.join(map(str, row))) - - print('docs') - doc_rows = self.cursor.execute('SELECT * FROM docs').fetchall() - for row in doc_rows: - print('\t'.join(map(str, row))) + def __str__(self): + dict_rows = self.cursor.execute('SELECT * FROM dict').fetchdf() + term_rows = self.cursor.execute('SELECT * FROM terms').fetchdf() + doc_rows = self.cursor.execute('SELECT * FROM docs').fetchdf() + + return '\n'.join([ + 'dict', + '-' * 20, + str(dict_rows), + '', + 'terms', + '-' * 20, + str(term_rows), + '', + 'docs', + '-' * 20, + str(doc_rows), + ]) diff --git a/main.py b/main.py index 951cdcc48361c2471031dbbd52f35531f2b34d66..162fa3bc54de646925d22d695f16eb9334452ad4 100644 --- a/main.py +++ b/main.py @@ -1,25 +1,27 @@ from argparse import ArgumentParser +import argparse from index import Index -def bulk_index(index, args): +def bulk_index(index: Index, args: argparse.Namespace): filename = args.data index.bulk_index(filename) - index.print_index() + print(index) -def query_index(index, args): +def query_index(index: Index, args: argparse.Namespace): query_terms = args.terms # TODO use query terms to query index -def clear_index(index, args): +def clear_index(index: Index, args: argparse.Namespace): index.clear() def main(): - parser = ArgumentParser(prog='old_duck', description='OldDuck - A Python implementation of OldDog, using DuckDB') + parser = argparse.ArgumentParser(prog='old_duck', + description='OldDuck - A Python implementation of OldDog, using DuckDB') subparsers = parser.add_subparsers() diff --git a/poetry.lock b/poetry.lock index 375bd194275492c1673d0e323f0a932e1a9bd80e..265e425ad9311f5045704de1704861e569bd505a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -10,6 +10,17 @@ version = "0.1.1" numpy = ">=1.14" pandas = ">=0.23" +[[package]] +category = "main" +description = "Natural Language Toolkit" +name = "nltk" +optional = false +python-versions = "*" +version = "3.4.5" + +[package.dependencies] +six = "*" + [[package]] category = "main" description = "NumPy is the fundamental package for array computing with Python." @@ -59,11 +70,12 @@ python-versions = ">=2.6, !=3.0.*, !=3.1.*" version = "1.13.0" [metadata] -content-hash = "f1f0c6ddf09ca82467ae759f4c23a8485d3efaebe06d3a8a1f1e223ef9726a54" +content-hash = "7ef45e999a2464d5a8aeb70a370dbd0e2806b146e945e1ee679294a69d61c430" python-versions = "^3.6" [metadata.hashes] duckdb = ["2272cc1f8a6496b1e1c38dac09d5de054bb5ec668a8767166f7b0254f7d61275", "2273dd14dc8a5a97da07d088a61dafc4cb647b97366ef4b16fd6e25ebc8f049b", "357c6de793c419ac0890f9eb019b386a799407f7a34451e4318035526944ea68", "3d52673dcc53c37e00a6dd225725bc9117ae89f1c1c998438295e8086eab617e", "439211f22b106e242b33143e882c68b0a12af4b4d99e8a6f2c89719f189aab91", "43aee1ae234dd84e0ec632c352dc25cd0caa7b3950ffdfb7886f0ff9255c7e7d", "77d22b198a72d782281aa40156e737be566c007426de26433ea00edd67fb6c05", "88a659064b1135bb4b3fc0c23638ce2544458fea82967485f4f5dd9d1013f699", "93dc01f26980fc620e9d25184502fee351b462722e99cc910bb73e2e60b690e4", "d31eda4dddb9ba8b9d9bf83ead74d82c069d76617666d57c05c782cab5ddd59b", "ec9cecb93394617412b13cd782e8f3bf22d8be07d271fd78c49d233c33c5fb5a"] +nltk = ["a08bdb4b8a1c13de16743068d9eb61c8c71c2e5d642e8e08205c528035843f82", "bed45551259aa2101381bbdd5df37d44ca2669c5c3dad72439fa459b29137d94"] numpy = ["0b0dd8f47fb177d00fa6ef2d58783c4f41ad3126b139c91dd2f7c4b3fdf5e9a5", "25ffe71f96878e1da7e014467e19e7db90ae7d4e12affbc73101bcf61785214e", "26efd7f7d755e6ca966a5c0ac5a930a87dbbaab1c51716ac26a38f42ecc9bc4b", "28b1180c758abf34a5c3fea76fcee66a87def1656724c42bb14a6f9717a5bdf7", "2e418f0a59473dac424f888dd57e85f77502a593b207809211c76e5396ae4f5c", "30c84e3a62cfcb9e3066f25226e131451312a044f1fe2040e69ce792cb7de418", "4650d94bb9c947151737ee022b934b7d9a845a7c76e476f3e460f09a0c8c6f39", "4dd830a11e8724c9c9379feed1d1be43113f8bcce55f47ea7186d3946769ce26", "4f2a2b279efde194877aff1f76cf61c68e840db242a5c7169f1ff0fd59a2b1e2", "62d22566b3e3428dfc9ec972014c38ed9a4db4f8969c78f5414012ccd80a149e", "669795516d62f38845c7033679c648903200980d68935baaa17ac5c7ae03ae0c", "75fcd60d682db3e1f8fbe2b8b0c6761937ad56d01c1dc73edf4ef2748d5b6bc4", "9395b0a41e8b7e9a284e3be7060db9d14ad80273841c952c83a5afc241d2bd98", "9e37c35fc4e9410093b04a77d11a34c64bf658565e30df7cbe882056088a91c1", "a0678793096205a4d784bd99f32803ba8100f639cf3b932dc63b21621390ea7e", "b46554ad4dafb2927f88de5a1d207398c5385edbb5c84d30b3ef187c4a3894d8", "c867eeccd934920a800f65c6068acdd6b87e80d45cd8c8beefff783b23cdc462", "dd0667f5be56fb1b570154c2c0516a528e02d50da121bbbb2cbb0b6f87f59bc2", "de2b1c20494bdf47f0160bd88ed05f5e48ae5dc336b8de7cfade71abcc95c0b9", "f1df7b2b7740dd777571c732f98adb5aad5450aee32772f1b39249c8a50386f6", "ffca69e29079f7880c5392bf675eb8b4146479d976ae1924d01cd92b04cccbcc"] pandas = ["00dff3a8e337f5ed7ad295d98a31821d3d0fe7792da82d78d7fd79b89c03ea9d", "22361b1597c8c2ffd697aa9bf85423afa9e1fcfa6b1ea821054a244d5f24d75e", "255920e63850dc512ce356233081098554d641ba99c3767dde9e9f35630f994b", "26382aab9c119735908d94d2c5c08020a4a0a82969b7e5eefb92f902b3b30ad7", "33970f4cacdd9a0ddb8f21e151bfb9f178afb7c36eb7c25b9094c02876f385c2", "4545467a637e0e1393f7d05d61dace89689ad6d6f66f267f86fff737b702cce9", "52da74df8a9c9a103af0a72c9d5fdc8e0183a90884278db7f386b5692a2220a4", "61741f5aeb252f39c3031d11405305b6d10ce663c53bc3112705d7ad66c013d0", "6a3ac2c87e4e32a969921d1428525f09462770c349147aa8e9ab95f88c71ec71", "7458c48e3d15b8aaa7d575be60e1e4dd70348efcd9376656b72fecd55c59a4c3", "78bf638993219311377ce9836b3dc05f627a666d0dbc8cec37c0ff3c9ada673b", "8153705d6545fd9eb6dd2bc79301bff08825d2e2f716d5dced48daafc2d0b81f", "975c461accd14e89d71772e89108a050fa824c0b87a67d34cedf245f6681fc17", "9962957a27bfb70ab64103d0a7b42fa59c642fb4ed4cb75d0227b7bb9228535d", "adc3d3a3f9e59a38d923e90e20c4922fc62d1e5a03d083440468c6d8f3f1ae0a", "bbe3eb765a0b1e578833d243e2814b60c825b7fdbf4cdfe8e8aae8a08ed56ecf", "df8864824b1fe488cf778c3650ee59c3a0d8f42e53707de167ba6b4f7d35f133", "e45055c30a608076e31a9fcd780a956ed3b1fa20db61561b8d88b79259f526f7", "ee50c2142cdcf41995655d499a157d0a812fce55c97d9aad13bc1eef837ed36c"] python-dateutil = ["73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", "75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"] diff --git a/pyproject.toml b/pyproject.toml index bff523b9b51944f393f580fa86c69e92b2b94d90..6e4004f368f2a4910b8722e42f569140c2354df5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ authors = ["Your Name "] [tool.poetry.dependencies] python = "^3.6" duckdb = "^0.1.1" +nltk = "^3.4" [tool.poetry.dev-dependencies]