Commit 755e1489 authored by Gijs Hendriksen

Fix MonetDB bulk index and add installation instructions

parent 1fc7afbc
# Installation
* Install poetry
* Run `poetry install` to create the virtual environment and install the project's dependencies
* Run `poetry shell` to enter the virtual environment
## MonetDB
For MonetDB, you have to set up the actual database first. You can do this as follows:
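* Run the MonetDB Docker container, replacing `/path/to/project` with the absolute path to this project (its `.monetdb` directory is mounted at `/app` inside the container, which is where MonetDB reads the bulk-load CSV files from):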
```
$ docker run -d -p 0.0.0.0:50000:50000 --name monetdb --volume /path/to/project/.monetdb:/app monetdb/monetdb
```
* Connect to the container using:
```
$ docker exec -it monetdb /bin/bash
```
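* In the container's shell, run the following commands to create, release, and start the `oldduck` database (the leading `#` is the root prompt, not part of the command):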
```
# monetdb create oldduck
# monetdb release oldduck
# monetdb start oldduck
```
......@@ -183,7 +183,7 @@ class DuckDBIndex(Index):
docs.append({
'docid': docid,
'name': document['name'],
'length': len(doc_terms),
'length': sum(doc_terms.values()),
})
amount_of_digits = math.floor(math.log10(len(data))) + 1
......@@ -241,12 +241,15 @@ class MonetDBIndex(Index):
hostname='localhost', database=db).cursor()
self.cursor.execute('CREATE TABLE IF NOT EXISTS dict('
'termid INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,'
'term VARCHAR(32) NOT NULL UNIQUE,'
# 'termid INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,'
'termid INTEGER NOT NULL,'
# 'term VARCHAR(32) NOT NULL UNIQUE,'
'term VARCHAR(64) NOT NULL,'
'df INTEGER NOT NULL)')
self.cursor.execute('CREATE TABLE IF NOT EXISTS docs('
'docid INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,'
'name VARCHAR(32) NOT NULL,'
# 'docid INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,'
'docid INTEGER NOT NULL,'
'name VARCHAR(64) NOT NULL,'
'length INTEGER NOT NULL)')
self.cursor.execute('CREATE TABLE IF NOT EXISTS terms('
'termid INTEGER NOT NULL,'
......@@ -323,7 +326,7 @@ class MonetDBIndex(Index):
docs.append({
'docid': docid,
'name': document['name'],
'length': len(doc_terms),
'length': sum(doc_terms.values()),
})
amount_of_digits = math.floor(math.log10(len(data))) + 1
......@@ -334,17 +337,15 @@ class MonetDBIndex(Index):
dict_table['termid'] = dict_table.index
dict_table = dict_table[['termid', 'term', 'df']]
dict_table.to_csv('dict.csv', header=False, index=False)
doc_table.to_csv('docs.csv', header=False, index=False)
term_table.to_csv('terms.csv', header=False, index=False)
dict_table.to_csv('.monetdb/dict.csv', header=False, index=False)
doc_table.to_csv('.monetdb/docs.csv', header=False, index=False)
term_table.to_csv('.monetdb/terms.csv', header=False, index=False)
for table in ('dict', 'docs', 'terms'):
filename = os.path.abspath(f'{table}.csv')
self.cursor.execute(f'DELETE FROM {table}')
self.cursor.execute(f"COPY INTO {table} FROM '{filename}'")
self.cursor.execute(f"COPY INTO {table} FROM '/app/{table}.csv' USING DELIMITERS ','")
os.remove(f'{table}.csv')
os.remove(f'.monetdb/{table}.csv')
self.cursor.execute('COMMIT')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment