Commit 7e890b22 authored by Gijs Hendriksen

Complete bulk indexing using dataframes and CSV import/export

parent 6e64d073
import duckdb
import json
import os
import math
import numpy as np
import pandas as pd
from collections import defaultdict
......@@ -106,6 +107,9 @@ class Index:
term_table = self.cursor.execute('SELECT * FROM terms').fetchdf()
doc_table = self.cursor.execute('SELECT * FROM docs').fetchdf()
if not dict_table.empty:
dict_table.set_index('termid', inplace=True)
docs = []
docid_start = 1 if doc_table.empty else doc_table['docid'].max() + 1
......@@ -128,7 +132,8 @@ class Index:
new_terms = dict_table.loc[dict_table['term'].isin(doc_terms)].copy()
new_terms['termid'] = new_terms.index
new_terms['docid'] = np.repeat(docid, len(doc_terms))
new_terms = new_terms.replace({'term': doc_terms}).rename(columns={'term': 'count'}).drop('df', 1)
new_terms = (new_terms.replace({'term': doc_terms})
.rename(columns={'term': 'count'})[['termid', 'docid', 'count']])
term_table = pd.concat([term_table, new_terms], ignore_index=True)
......@@ -138,16 +143,25 @@ class Index:
'length': len(doc_terms),
})
print(i)
amount_of_digits = math.floor(math.log10(len(data))) + 1
print(f'{i:>{amount_of_digits}d}/{len(data)}', end='\r')
new_docs = pd.DataFrame(docs, columns=['docid', 'name', 'length'])
doc_table = new_docs if doc_table.empty else pd.concat([doc_table, new_docs], ignore_index=True)
dict_table['termid'] = dict_table.index
dict_table = dict_table[['termid', 'term', 'df']]
dict_table.to_csv('dict.csv', header=False, index=False)
doc_table.to_csv('docs.csv', header=False, index=False)
term_table.to_csv('terms.csv', header=False, index=False)
print(f'Done in {time.time() - start:.4f} seconds!')
for table in ('dict', 'docs', 'terms'):
self.cursor.execute(f'DELETE FROM {table}')
self.cursor.execute(f"COPY {table} FROM '{table}.csv'")
os.remove(f'{table}.csv')
# for i, document in enumerate(data):
# self.index(document)
# print(i)
current = time.time()
print(f'Indexed {len(data)} documents in {current - start:.2f} seconds!')
def search(self, query):
self.cursor.execute(query)
......
......@@ -17,8 +17,5 @@ class Search:
else:
raise NotImplementedError(f'Search method "{method}" was not implemented')
print(sql_query)
print(re.sub(r'\s+', ' ', sql_query))
print(self.index.search(re.sub(r'\s+', ' ', sql_query.strip())))
result = self.index.search(sql_query)
print(result)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment