diff --git a/index.py b/index.py index 516b7b8ede47a6a14c373dbc75e69b19f71a2a1e..62cdb57c4570e2a9c16b78a84b415aa898f45004 100644 --- a/index.py +++ b/index.py @@ -1,6 +1,7 @@ import duckdb import json import os +import math import numpy as np import pandas as pd from collections import defaultdict @@ -106,6 +107,9 @@ class Index: term_table = self.cursor.execute('SELECT * FROM terms').fetchdf() doc_table = self.cursor.execute('SELECT * FROM docs').fetchdf() + if not dict_table.empty: + dict_table.set_index('termid', inplace=True) + docs = [] docid_start = 1 if doc_table.empty else doc_table['docid'].max() + 1 @@ -128,7 +132,8 @@ class Index: new_terms = dict_table.loc[dict_table['term'].isin(doc_terms)].copy() new_terms['termid'] = new_terms.index new_terms['docid'] = np.repeat(docid, len(doc_terms)) - new_terms = new_terms.replace({'term': doc_terms}).rename(columns={'term': 'count'}).drop('df', 1) + new_terms = (new_terms.replace({'term': doc_terms}) + .rename(columns={'term': 'count'})[['termid', 'docid', 'count']]) term_table = pd.concat([term_table, new_terms], ignore_index=True) @@ -138,16 +143,25 @@ class Index: 'length': len(doc_terms), }) - print(i) + amount_of_digits = math.floor(math.log10(len(data))) + 1 + print(f'{i:>{amount_of_digits}d}/{len(data)}', end='\r') new_docs = pd.DataFrame(docs, columns=['docid', 'name', 'length']) doc_table = new_docs if doc_table.empty else pd.concat([doc_table, new_docs], ignore_index=True) + dict_table['termid'] = dict_table.index + dict_table = dict_table[['termid', 'term', 'df']] + + dict_table.to_csv('dict.csv', header=False, index=False) + doc_table.to_csv('docs.csv', header=False, index=False) + term_table.to_csv('terms.csv', header=False, index=False) - print(f'Done in {time.time() - start:.4f} seconds!') + for table in ('dict', 'docs', 'terms'): + self.cursor.execute(f'DELETE FROM {table}') + self.cursor.execute(f"COPY {table} FROM '{table}.csv'") + os.remove(f'{table}.csv') - # for i, document in enumerate(data): - # self.index(document) - # print(i) + current = time.time() + print(f'Indexed {len(data)} documents in {current - start:.2f} seconds!') def search(self, query): self.cursor.execute(query) diff --git a/search.py b/search.py index ce0c32fb0da212e9ac17e6e9e51dc6af4551fbec..828e7204015ba1815ab7540c90dfd6b476172f18 100644 --- a/search.py +++ b/search.py @@ -17,8 +17,5 @@ class Search: else: raise NotImplementedError(f'Search method "{method}" was not implemented') - print(sql_query) - - print(re.sub(r'\s+', ' ', sql_query)) - - print(self.index.search(re.sub(r'\s+', ' ', sql_query.strip()))) + result = self.index.search(sql_query) + print(result)