index.py 3.44 KB
Newer Older
Gijs Hendriksen's avatar
Gijs Hendriksen committed
1 2
import duckdb
import os
3
from collections import defaultdict
Gijs Hendriksen's avatar
Gijs Hendriksen committed
4 5 6 7


class Index:
    """Inverted index over text documents, stored in a DuckDB database.

    Three tables are maintained:

    * ``dict``  -- one row per distinct term: ``termid``, ``term`` and ``df``
      (incremented once for every indexed document that contains the term).
    * ``docs``  -- one row per document: ``docid``, ``name`` and ``length``.
    * ``terms`` -- one row per (term, document) pair: ``termid``, ``docid``
      and ``count`` (occurrences of the term in that document).

    New IDs are drawn from the ``term_ids`` and ``doc_ids`` sequences.
    """

    def __init__(self, db):
        """Open the database at ``db`` and prepare it for indexing.

        Args:
            db: Database location, passed unchanged to ``duckdb.connect``.
                If a file already exists at this path the ID sequences are
                re-created to continue after the stored IDs; otherwise the
                sequences and tables are created from scratch.
        """
        # Must be checked before connecting: the branch below depends on
        # whether the path existed prior to ``duckdb.connect`` being called.
        db_exists = os.path.exists(db)
        self.cursor = duckdb.connect(db).cursor()

        # Tokens that are dropped by get_terms() and therefore never indexed.
        self.stopwords = ['i', 'on', 'my', 'and', 'in', 'the']

        if db_exists:
            self.reset_auto_increment()
        else:
            self.init_db()

    def reset_auto_increment(self):
        """Re-create both ID sequences so they start after the largest stored ID.

        Each sequence restarts at ``MAX(id) + 1`` of its table, or at 1 when
        the table is empty.
        """
        max_termid = self.cursor.execute('SELECT MAX(termid) FROM terms').fetchone()[0]
        next_termid = 1 if max_termid is None else max_termid + 1

        max_docid = self.cursor.execute('SELECT MAX(docid) FROM docs').fetchone()[0]
        next_docid = 1 if max_docid is None else max_docid + 1

        self.cursor.execute('DROP SEQUENCE term_ids')
        self.cursor.execute('DROP SEQUENCE doc_ids')
        # The start values are integers computed above (never user input),
        # so interpolating them into the DDL statement is safe.
        self.cursor.execute(f'CREATE SEQUENCE term_ids START WITH {next_termid}')
        self.cursor.execute(f'CREATE SEQUENCE doc_ids START WITH {next_docid}')

    def init_db(self):
        """Create the ID sequences and the ``dict``, ``docs`` and ``terms`` tables."""
        self.cursor.execute('CREATE SEQUENCE term_ids')
        self.cursor.execute('CREATE SEQUENCE doc_ids')
        self.cursor.execute('CREATE TABLE dict('
                            'termid INTEGER NOT NULL,'
                            'term VARCHAR NOT NULL,'
                            'df INTEGER NOT NULL)')
        self.cursor.execute('CREATE TABLE docs('
                            'docid INTEGER NOT NULL,'
                            'name VARCHAR NOT NULL,'
                            'length INTEGER NOT NULL)')
        self.cursor.execute('CREATE TABLE terms('
                            'termid INTEGER NOT NULL,'
                            'docid INTEGER NOT NULL,'
                            'count INTEGER NOT NULL)')

    def get_terms(self, body):
        """Count the non-stopword tokens of ``body``.

        The text is lower-cased and split on whitespace; no other
        normalisation (punctuation stripping, stemming, ...) is applied.

        Args:
            body: Document text.

        Returns:
            A ``defaultdict(int)`` mapping each token to its number of
            occurrences, in order of first appearance.
        """
        # Build the set once per call so each membership test is O(1),
        # while ``self.stopwords`` itself stays a plain list for callers.
        stopwords = set(self.stopwords)
        terms = defaultdict(int)

        for term in body.lower().split():
            if term not in stopwords:
                terms[term] += 1

        return terms

    def index(self, document):
        """Add one document to the index.

        Inserts a row into ``docs``, then for every distinct term either
        creates its ``dict`` entry (``df`` = 1) or increments the existing
        ``df``, and records the term's in-document count in ``terms``.

        Args:
            document: Mapping with a ``'name'`` entry (document name) and a
                ``'body'`` entry (document text).
        """
        terms = self.get_terms(document['body'])
        doc_name = document['name']
        # NOTE(review): this stores the number of *distinct* non-stopword
        # terms, not the total token count -- confirm this is the intended
        # meaning of ``docs.length``.
        doc_length = len(terms)
        doc_id = self.cursor.execute("SELECT nextval('doc_ids')").fetchone()[0]

        # All values are passed as bound parameters rather than interpolated
        # into the SQL text: names and tokens may contain quotes (e.g.
        # "don't"), which would otherwise break or alter the statement.
        self.cursor.execute('INSERT INTO docs VALUES (?, ?, ?)',
                            [doc_id, doc_name, doc_length])

        for term, frequency in terms.items():
            row = self.cursor.execute('SELECT termid FROM dict WHERE term = ?',
                                      [term]).fetchone()

            if row is None:
                # First time this term is seen: allocate an ID for it.
                term_id = self.cursor.execute("SELECT nextval('term_ids')").fetchone()[0]
                self.cursor.execute('INSERT INTO dict VALUES (?, ?, 1)',
                                    [term_id, term])
            else:
                term_id = row[0]
                self.cursor.execute('UPDATE dict SET df = df + 1 WHERE termid = ?',
                                    [term_id])

            self.cursor.execute('INSERT INTO terms VALUES (?, ?, ?)',
                                [term_id, doc_id, frequency])

    def bulk_index(self, filename):
        """Index every document found in ``filename`` (not implemented yet).

        Currently a no-op that returns ``None``.
        """
        # TODO read data from filename and index documents
        pass

    def print_index(self):
        """Print the ``dict``, ``terms`` and ``docs`` tables to standard output.

        Each table is printed as its name on one line followed by one
        tab-separated line per row.
        """
        # Table names are fixed literals, so formatting them into the query
        # is safe.
        for table in ('dict', 'terms', 'docs'):
            print(table)
            rows = self.cursor.execute(f'SELECT * FROM {table}').fetchall()
            for row in rows:
                print('\t'.join(map(str, row)))