Commit 755e1489 authored by Gijs Hendriksen

Fix MonetDB bulk index and add installation instructions

parent 1fc7afbc
# Installation
* Install poetry
* Run `poetry install` to create the virtual environment and install the project's dependencies
* Run `poetry shell` to enter the virtual environment
## MonetDB
For MonetDB, you first have to set up the actual database. You can do this as follows:
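* Run the MonetDB Docker container, replacing `/path/to/project` with the absolute path to this project (the bulk index writes its CSV files to the project's `.monetdb` directory, which the container reads as `/app`):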
```
$ docker run -d -p 0.0.0.0:50000:50000 --name monetdb --volume /path/to/project/.monetdb:/app monetdb/monetdb
```
* Connect to the container using:
```
$ docker exec -it monetdb /bin/bash
```
* In the container's shell, run the following commands to create the `oldduck` database, release it from maintenance mode, and start it:
```
# monetdb create oldduck
# monetdb release oldduck
# monetdb start oldduck
```
...@@ -183,7 +183,7 @@ class DuckDBIndex(Index): ...@@ -183,7 +183,7 @@ class DuckDBIndex(Index):
docs.append({ docs.append({
'docid': docid, 'docid': docid,
'name': document['name'], 'name': document['name'],
'length': len(doc_terms), 'length': sum(doc_terms.values()),
}) })
amount_of_digits = math.floor(math.log10(len(data))) + 1 amount_of_digits = math.floor(math.log10(len(data))) + 1
...@@ -241,12 +241,15 @@ class MonetDBIndex(Index): ...@@ -241,12 +241,15 @@ class MonetDBIndex(Index):
hostname='localhost', database=db).cursor() hostname='localhost', database=db).cursor()
self.cursor.execute('CREATE TABLE IF NOT EXISTS dict(' self.cursor.execute('CREATE TABLE IF NOT EXISTS dict('
'termid INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,' # 'termid INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,'
'term VARCHAR(32) NOT NULL UNIQUE,' 'termid INTEGER NOT NULL,'
# 'term VARCHAR(32) NOT NULL UNIQUE,'
'term VARCHAR(64) NOT NULL,'
'df INTEGER NOT NULL)') 'df INTEGER NOT NULL)')
self.cursor.execute('CREATE TABLE IF NOT EXISTS docs(' self.cursor.execute('CREATE TABLE IF NOT EXISTS docs('
'docid INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,' # 'docid INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,'
'name VARCHAR(32) NOT NULL,' 'docid INTEGER NOT NULL,'
'name VARCHAR(64) NOT NULL,'
'length INTEGER NOT NULL)') 'length INTEGER NOT NULL)')
self.cursor.execute('CREATE TABLE IF NOT EXISTS terms(' self.cursor.execute('CREATE TABLE IF NOT EXISTS terms('
'termid INTEGER NOT NULL,' 'termid INTEGER NOT NULL,'
...@@ -323,7 +326,7 @@ class MonetDBIndex(Index): ...@@ -323,7 +326,7 @@ class MonetDBIndex(Index):
docs.append({ docs.append({
'docid': docid, 'docid': docid,
'name': document['name'], 'name': document['name'],
'length': len(doc_terms), 'length': sum(doc_terms.values()),
}) })
amount_of_digits = math.floor(math.log10(len(data))) + 1 amount_of_digits = math.floor(math.log10(len(data))) + 1
...@@ -334,17 +337,15 @@ class MonetDBIndex(Index): ...@@ -334,17 +337,15 @@ class MonetDBIndex(Index):
dict_table['termid'] = dict_table.index dict_table['termid'] = dict_table.index
dict_table = dict_table[['termid', 'term', 'df']] dict_table = dict_table[['termid', 'term', 'df']]
dict_table.to_csv('dict.csv', header=False, index=False) dict_table.to_csv('.monetdb/dict.csv', header=False, index=False)
doc_table.to_csv('docs.csv', header=False, index=False) doc_table.to_csv('.monetdb/docs.csv', header=False, index=False)
term_table.to_csv('terms.csv', header=False, index=False) term_table.to_csv('.monetdb/terms.csv', header=False, index=False)
for table in ('dict', 'docs', 'terms'): for table in ('dict', 'docs', 'terms'):
filename = os.path.abspath(f'{table}.csv')
self.cursor.execute(f'DELETE FROM {table}') self.cursor.execute(f'DELETE FROM {table}')
self.cursor.execute(f"COPY INTO {table} FROM '{filename}'") self.cursor.execute(f"COPY INTO {table} FROM '/app/{table}.csv' USING DELIMITERS ','")
os.remove(f'{table}.csv') os.remove(f'.monetdb/{table}.csv')
self.cursor.execute('COMMIT') self.cursor.execute('COMMIT')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment