Commit f8f1c0a3 authored by Gijs Hendriksen

Complete indexing step, add wrapper for index operations

parent 402b2c93
.idea/
__pycache__/
*.db
*.db.wal
import duckdb
import os
from collections import defaultdict
class Index:
    """Wrapper around an inverted index stored in a DuckDB database.

    The index lives in three tables:

    * ``dict``  -- one row per distinct term: ``(termid, term, df)``
    * ``docs``  -- one row per document: ``(docid, name, length)``
    * ``terms`` -- one row per (term, document) pair: ``(termid, docid, count)``

    The ``term_ids`` and ``doc_ids`` sequences hand out new identifiers.
    """

    def __init__(self, db):
        """Open the database file *db* and make it ready for indexing.

        A new database gets the schema created; an existing one only has its
        id sequences restarted after the highest ids already stored.

        Args:
            db: path of the DuckDB database file.
        """
        # Checked before connecting, because connecting may create the file.
        db_exists = os.path.exists(db)

        self.cursor = duckdb.connect(db).cursor()
        self.stopwords = ['i', 'on', 'my', 'and', 'in', 'the']

        if db_exists:
            self.reset_auto_increment()
        else:
            self.init_db()

    def reset_auto_increment(self):
        """Restart both id sequences just past the highest id in use."""
        # The term sequence feeds ``dict.termid``, so ``dict`` is the
        # authoritative place to look for the highest term id.
        self._restart_sequence('term_ids', 'termid', 'dict')
        self._restart_sequence('doc_ids', 'docid', 'docs')

    def _restart_sequence(self, sequence, column, table):
        """Recreate *sequence* so it starts after ``MAX(column)`` of *table*."""
        highest = self.cursor.execute(f'SELECT MAX({column}) FROM {table}').fetchone()[0]
        # MAX() yields NULL on an empty table; numbering then starts at 1.
        start = 1 if highest is None else int(highest) + 1

        # DDL cannot take bound parameters; every interpolated value here is
        # either a fixed identifier or an integer computed above.
        self.cursor.execute(f'DROP SEQUENCE {sequence}')
        self.cursor.execute(f'CREATE SEQUENCE {sequence} START WITH {start}')

    def init_db(self):
        """Create the id sequences and the three index tables."""
        self.cursor.execute('CREATE SEQUENCE term_ids')
        self.cursor.execute('CREATE SEQUENCE doc_ids')

        # No PRIMARY KEY constraints are declared; ids are taken from the
        # sequences above.
        self.cursor.execute('CREATE TABLE dict('
                            'termid INTEGER NOT NULL,'
                            'term VARCHAR NOT NULL,'
                            'df INTEGER NOT NULL)')
        self.cursor.execute('CREATE TABLE docs('
                            'docid INTEGER NOT NULL,'
                            'name VARCHAR NOT NULL,'
                            'length INTEGER NOT NULL)')
        self.cursor.execute('CREATE TABLE terms('
                            'termid INTEGER NOT NULL,'
                            'docid INTEGER NOT NULL,'
                            'count INTEGER NOT NULL)')

    def get_terms(self, body):
        """Count the non-stopword terms of *body*.

        The text is lower-cased and split on whitespace.

        Args:
            body: the document text.

        Returns:
            A ``defaultdict(int)`` mapping each remaining term to the number
            of times it occurs.
        """
        terms = defaultdict(int)

        for term in body.lower().split():
            if term not in self.stopwords:
                terms[term] += 1

        return terms

    def index(self, document):
        """Add one document to the index.

        Writes one ``docs`` row, creates or updates one ``dict`` row per
        distinct term (``df`` starts at 1 and grows by 1 for each further
        document containing the term) and writes one ``terms`` row per
        distinct term with its in-document count.

        Args:
            document: mapping with a ``'name'`` and a ``'body'`` entry.
        """
        terms = self.get_terms(document['body'])
        doc_name = document['name']
        # Length in tokens (stopwords excluded), not in distinct terms.
        doc_length = sum(terms.values())

        doc_id = self.cursor.execute("SELECT nextval('doc_ids')").fetchone()[0]
        # Values are bound as parameters so that quotes in names or terms
        # cannot break (or inject into) the statement.
        self.cursor.execute('INSERT INTO docs VALUES (?, ?, ?)', [doc_id, doc_name, doc_length])

        for term, frequency in terms.items():
            row = self.cursor.execute('SELECT termid FROM dict WHERE term = ?', [term]).fetchone()

            if row is None:
                term_id = self.cursor.execute("SELECT nextval('term_ids')").fetchone()[0]
                self.cursor.execute('INSERT INTO dict VALUES (?, ?, 1)', [term_id, term])
            else:
                term_id = row[0]
                self.cursor.execute('UPDATE dict SET df = df + 1 WHERE termid = ?', [term_id])

            self.cursor.execute('INSERT INTO terms VALUES (?, ?, ?)', [term_id, doc_id, frequency])

    def bulk_index(self, filename):
        """Index all documents stored in *filename* (not implemented yet).

        Args:
            filename: path of the file to read documents from.
        """
        # TODO read data from filename and index documents
        pass

    def print_index(self):
        """Print the ``dict``, ``terms`` and ``docs`` tables, tab-separated."""
        for table in ('dict', 'terms', 'docs'):
            print(table)
            for row in self.cursor.execute(f'SELECT * FROM {table}').fetchall():
                print('\t'.join(map(str, row)))
from argparse import ArgumentParser
from index import Index
def bulk_index(index, args):
    """Command handler: hand the data file from the parsed arguments to the index.

    Args:
        index: object exposing a ``bulk_index(filename)`` method.
        args: parsed arguments carrying the file name in ``data``.
    """
    index.bulk_index(args.data)
def query_index(index, args):
    """Command handler for querying; currently only reads the query terms.

    Args:
        index: the index to query (not used yet).
        args: parsed arguments carrying the query terms in ``terms``.
    """
    # TODO use query terms to query index
    requested_terms = args.terms
def main():
    """Parse the command line and run the chosen sub-command.

    Two sub-commands exist: ``index <database> <data>`` and
    ``query <database> [terms ...]``. Running without a sub-command prints
    a usage error and exits.
    """
    parser = ArgumentParser(prog='old_duck', description='OldDuck - A Python implementation of OldDog, using DuckDB')
    subparsers = parser.add_subparsers()

    parser_index = subparsers.add_parser('index')
    parser_index.add_argument('database', help='The database file to index the files to')
    parser_index.add_argument('data', help='The file to read and index documents from')
    # The handler is registered on the sub-parser: defaults set on the parent
    # parser are shared, so a later one would overwrite an earlier one.
    parser_index.set_defaults(func=bulk_index)

    parser_query = subparsers.add_parser('query')
    parser_query.add_argument('database', help='The database file to query')
    parser_query.add_argument('terms', help='The query terms', nargs='*')
    parser_query.set_defaults(func=query_index)

    args = parser.parse_args()

    # Without a sub-command no handler (and no database) was set.
    if not hasattr(args, 'func'):
        parser.error('a sub-command is required: index or query')

    index = Index(args.database)
    args.func(index, args)
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment