From f8f1c0a30d5166936ea5a48cb5b8e154157f06cf Mon Sep 17 00:00:00 2001 From: Gijs Hendriksen Date: Wed, 13 Nov 2019 15:14:45 +0100 Subject: [PATCH] Complete indexing step, add wrapper for index operations --- .gitignore | 1 + index.py | 88 +++++++++++++++++++++++++++++++++++------------------- main.py | 38 +++++++++++++++++++++++ 3 files changed, 96 insertions(+), 31 deletions(-) create mode 100644 main.py diff --git a/.gitignore b/.gitignore index 136c3c7..75fcaad 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .idea/ +__pycache__/ *.db *.db.wal diff --git a/index.py b/index.py index 1d7f114..04a9569 100644 --- a/index.py +++ b/index.py @@ -1,67 +1,93 @@ import duckdb import os +from collections import defaultdict class Index: def __init__(self, db): + db_exists = os.path.exists(db) self.cursor = duckdb.connect(db).cursor() - self.stopwords = [] + self.stopwords = ['i', 'on', 'my', 'and', 'in', 'the'] - if os.path.exists(db): - os.remove(db) + if db_exists: + self.reset_auto_increment() + else: + self.init_db() - self.create_table() + def reset_auto_increment(self): + max_termid = self.cursor.execute('SELECT MAX(termid) FROM terms').fetchone()[0] + max_termid = 1 if max_termid is None else max_termid + 1 - # def init_db(self): - # max_termid = self.cursor.execute('SELECT MAX(termid) + 1 FROM terms').fetchone()[0] - # max_termid = 1 if max_termid is None else max_termid[0] - # - # max_docid = self.cursor.execute('SELECT MAX(docid) + 1 FROM docs').fetchone()[0] - # max_docid = 1 if max_docid is None else max_docid[0] - # - # self.cursor.execute() - # self.cursor.execute(f'CREATE SEQUENCE term_ids START WITH {max_termid}') - # self.cursor.execute(f'CREATE SEQUENCE doc_ids START WITH ({max_docid})') + max_docid = self.cursor.execute('SELECT MAX(docid) FROM docs').fetchone()[0] + max_docid = 1 if max_docid is None else max_docid + 1 - def create_table(self): + self.cursor.execute('DROP SEQUENCE term_ids') + self.cursor.execute('DROP SEQUENCE doc_ids') + self.cursor.execute(f'CREATE SEQUENCE term_ids START WITH {max_termid}') + self.cursor.execute(f'CREATE SEQUENCE doc_ids START WITH {max_docid}') + + def init_db(self): self.cursor.execute(f'CREATE SEQUENCE term_ids') self.cursor.execute(f'CREATE SEQUENCE doc_ids') self.cursor.execute('CREATE TABLE dict(' 'termid INTEGER NOT NULL,' 'term VARCHAR NOT NULL,' - 'df INTEGER NOT NULL,' - 'PRIMARY KEY (termid))') + 'df INTEGER NOT NULL)') self.cursor.execute('CREATE TABLE docs(' 'docid INTEGER NOT NULL,' 'name VARCHAR NOT NULL,' - 'length INTEGER NOT NULL,' - 'PRIMARY KEY (docid))') + 'length INTEGER NOT NULL)') self.cursor.execute('CREATE TABLE terms(' 'termid INTEGER NOT NULL,' 'docid INTEGER NOT NULL,' 'count INTEGER NOT NULL)') def get_terms(self, body): - terms = body.split() - return [term.lower() for term in terms if term.lower() not in self.stopwords] + terms = defaultdict(int) + + for term in body.lower().split(): + if term not in self.stopwords: + terms[term] += 1 + + return terms def index(self, document): terms = self.get_terms(document['body']) - for term in terms: + doc_name = document['name'] + doc_length = len(terms) + doc_id = self.cursor.execute("SELECT nextval('doc_ids')").fetchone()[0] + + self.cursor.execute(f"INSERT INTO docs VALUES ({doc_id}, '{doc_name}', {doc_length})") + + for term, frequency in terms.items(): term_id = self.cursor.execute(f"SELECT termid FROM dict WHERE term = '{term}'").fetchone() - print(term_id) if term_id is None: - self.cursor.execute(f"INSERT INTO dict VALUES (nextval('term_ids'), '{term}', 1)") + term_id = self.cursor.execute("SELECT nextval('term_ids')").fetchone()[0] + self.cursor.execute(f"INSERT INTO dict VALUES ({term_id}, '{term}', 1)") else: - self.cursor.execute(f"UPDATE dict SET df = df + 1 WHERE termid = {term_id[0]}") + term_id = term_id[0] + self.cursor.execute(f"UPDATE dict SET df = df + 1 WHERE termid = {term_id}") + + self.cursor.execute(f"INSERT INTO terms VALUES ({term_id}, {doc_id}, {frequency})") + + def bulk_index(self, filename): + # TODO read data from filename and index documents + pass - print(self.cursor.execute('SELECT * FROM dict').fetchall()) + def print_index(self): + print('dict') + dict_rows = self.cursor.execute('SELECT * FROM dict').fetchall() + for row in dict_rows: + print('\t'.join(map(str, row))) + print('terms') + term_rows = self.cursor.execute('SELECT * FROM terms').fetchall() + for row in term_rows: + print('\t'.join(map(str, row))) -index = Index('docs.db') -index.index({ - 'body': 'I I put on my robe and wizard hat', - 'name': 'doc1', -}) + print('docs') + doc_rows = self.cursor.execute('SELECT * FROM docs').fetchall() + for row in doc_rows: + print('\t'.join(map(str, row))) diff --git a/main.py b/main.py new file mode 100644 index 0000000..665d6e3 --- /dev/null +++ b/main.py @@ -0,0 +1,38 @@ +from argparse import ArgumentParser + +from index import Index + + +def bulk_index(index, args): + filename = args.data + index.bulk_index(filename) + + +def query_index(index, args): + query_terms = args.terms + # TODO use query terms to query index + + +def main(): + parser = ArgumentParser(prog='old_duck', description='OldDuck - A Python implementation of OldDog, using DuckDB') + + subparsers = parser.add_subparsers() + + parser_index = subparsers.add_parser('index') + parser_index.add_argument('database', help='The database file to index the files to') + parser_index.add_argument('data', help='The file to read and index documents from') + parser.set_defaults(func=bulk_index) + + parser_query = subparsers.add_parser('query') + parser_query.add_argument('database', help='The database file to index the files to') + parser_query.add_argument('terms', help='The query terms', nargs='*') + parser.set_defaults(func=query_index) + + args = parser.parse_args() + + index = Index(args.database) + args.func(index, args) + + +if __name__ == '__main__': + main() -- GitLab