Commit 7e890b22 authored by Gijs Hendriksen

Complete bulk indexing using dataframes and CSV import/export

parent 6e64d073
 import duckdb
 import json
 import os
+import math
 import numpy as np
 import pandas as pd
 from collections import defaultdict
@@ -106,6 +107,9 @@ class Index:
         term_table = self.cursor.execute('SELECT * FROM terms').fetchdf()
         doc_table = self.cursor.execute('SELECT * FROM docs').fetchdf()
+        if not dict_table.empty:
+            dict_table.set_index('termid', inplace=True)
+
         docs = []
         docid_start = 1 if doc_table.empty else doc_table['docid'].max() + 1
@@ -128,7 +132,8 @@ class Index:
             new_terms = dict_table.loc[dict_table['term'].isin(doc_terms)].copy()
             new_terms['termid'] = new_terms.index
             new_terms['docid'] = np.repeat(docid, len(doc_terms))
-            new_terms = new_terms.replace({'term': doc_terms}).rename(columns={'term': 'count'}).drop('df', 1)
+            new_terms = (new_terms.replace({'term': doc_terms})
+                         .rename(columns={'term': 'count'})[['termid', 'docid', 'count']])

             term_table = pd.concat([term_table, new_terms], ignore_index=True)
@@ -138,16 +143,25 @@ class Index:
                 'length': len(doc_terms),
             })

-            print(i)
+            amount_of_digits = math.floor(math.log10(len(data))) + 1
+            print(f'{i:>{amount_of_digits}d}/{len(data)}', end='\r')

         new_docs = pd.DataFrame(docs, columns=['docid', 'name', 'length'])
         doc_table = new_docs if doc_table.empty else pd.concat([doc_table, new_docs], ignore_index=True)

+        dict_table['termid'] = dict_table.index
+        dict_table = dict_table[['termid', 'term', 'df']]
+
+        dict_table.to_csv('dict.csv', header=False, index=False)
+        doc_table.to_csv('docs.csv', header=False, index=False)
+        term_table.to_csv('terms.csv', header=False, index=False)
+
-        print(f'Done in {time.time() - start:.4f} seconds!')
-        # for i, document in enumerate(data):
-        #     self.index(document)
-        #     print(i)
+        for table in ('dict', 'docs', 'terms'):
+            self.cursor.execute(f'DELETE FROM {table}')
+            self.cursor.execute(f"COPY {table} FROM '{table}.csv'")
+            os.remove(f'{table}.csv')
+
+        current = time.time()
+        print(f'Indexed {len(data)} documents in {current - start:.2f} seconds!')

     def search(self, query):
         self.cursor.execute(query)
@@ -17,8 +17,5 @@ class Search:
         else:
             raise NotImplementedError(f'Search method "{method}" was not implemented')

-        print(sql_query)
-        print(re.sub(r'\s+', ' ', sql_query))
-        print(self.index.search(re.sub(r'\s+', ' ', sql_query.strip())))
+        result = self.index.search(sql_query)
+        print(result)
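Note on the pattern above: the indexer now builds its tables as pandas DataFrames, writes each one to a temporary CSV, and lets DuckDB ingest the file with a single COPY statement instead of issuing per-row INSERTs. Below is a minimal, self-contained sketch of that round trip; the docs table name and its three-column schema are illustrative stand-ins chosen to mirror the columns in the diff, not code taken from this repository.

import os

import duckdb
import pandas as pd

# In-memory database with a stand-in table (schema assumed for illustration).
con = duckdb.connect(':memory:')
con.execute('CREATE TABLE docs (docid INTEGER, name VARCHAR, length INTEGER)')

# Build the rows in pandas first, as the bulk indexer does.
doc_table = pd.DataFrame(
    [(1, 'doc_a', 120), (2, 'doc_b', 87)],
    columns=['docid', 'name', 'length'],
)

# Write without header or index so the CSV columns line up positionally with the table.
doc_table.to_csv('docs.csv', header=False, index=False)

con.execute('DELETE FROM docs')            # clear existing rows, as the diff does
con.execute("COPY docs FROM 'docs.csv'")   # one bulk import instead of per-row INSERTs
os.remove('docs.csv')                      # the CSV is only a transfer buffer

print(con.execute('SELECT * FROM docs').fetchdf())

DuckDB's Python client can also insert directly from a DataFrame visible in the local scope (e.g. INSERT INTO docs SELECT * FROM doc_table), so the CSV hop is one of several ways to perform the bulk load; it is the one this commit uses.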