Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
Gijs Hendriksen
Information Retrieval
Commits
755e1489
Commit
755e1489
authored
Dec 06, 2019
by
Gijs Hendriksen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix MonetDB bulk index and add installation instructions
parent
1fc7afbc
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
38 additions
and
13 deletions
+38
-13
README.md
README.md
+24
-0
index.py
index.py
+14
-13
No files found.
README.md
0 → 100644
View file @
755e1489
# Installation
*
Install poetry
*
Run
`poetry install`
to create the virtual environment and install the project's dependencies
*
Run
`poetry shell`
to enter the virtual environment
## MonetDB
For MonetDB, you have to set up the actual database. You can do this as follows:
*
Run the MonetDB docker container:
```
$ docker run -d -p 0.0.0.0:50000:50000 --name monetdb --volume /path/to/project/.monetdb:/app monetdb/monetdb
```
*
Connect to the container using:
```
$ docker exec -it monetdb /bin/bash
```
*
In the docker shell, run the following commands:
```
# monetdb create oldduck
# monetdb release oldduck
# monetdb start oldduck
```
index.py
View file @
755e1489
...
@@ -183,7 +183,7 @@ class DuckDBIndex(Index):
...
@@ -183,7 +183,7 @@ class DuckDBIndex(Index):
docs
.
append
({
docs
.
append
({
'docid'
:
docid
,
'docid'
:
docid
,
'name'
:
document
[
'name'
],
'name'
:
document
[
'name'
],
'length'
:
len
(
doc_terms
),
'length'
:
sum
(
doc_terms
.
values
()
),
})
})
amount_of_digits
=
math
.
floor
(
math
.
log10
(
len
(
data
)))
+
1
amount_of_digits
=
math
.
floor
(
math
.
log10
(
len
(
data
)))
+
1
...
@@ -241,12 +241,15 @@ class MonetDBIndex(Index):
...
@@ -241,12 +241,15 @@ class MonetDBIndex(Index):
hostname
=
'localhost'
,
database
=
db
).
cursor
()
hostname
=
'localhost'
,
database
=
db
).
cursor
()
self
.
cursor
.
execute
(
'CREATE TABLE IF NOT EXISTS dict('
self
.
cursor
.
execute
(
'CREATE TABLE IF NOT EXISTS dict('
'termid INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,'
# 'termid INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,'
'term VARCHAR(32) NOT NULL UNIQUE,'
'termid INTEGER NOT NULL,'
# 'term VARCHAR(32) NOT NULL UNIQUE,'
'term VARCHAR(64) NOT NULL,'
'df INTEGER NOT NULL)'
)
'df INTEGER NOT NULL)'
)
self
.
cursor
.
execute
(
'CREATE TABLE IF NOT EXISTS docs('
self
.
cursor
.
execute
(
'CREATE TABLE IF NOT EXISTS docs('
'docid INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,'
# 'docid INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,'
'name VARCHAR(32) NOT NULL,'
'docid INTEGER NOT NULL,'
'name VARCHAR(64) NOT NULL,'
'length INTEGER NOT NULL)'
)
'length INTEGER NOT NULL)'
)
self
.
cursor
.
execute
(
'CREATE TABLE IF NOT EXISTS terms('
self
.
cursor
.
execute
(
'CREATE TABLE IF NOT EXISTS terms('
'termid INTEGER NOT NULL,'
'termid INTEGER NOT NULL,'
...
@@ -323,7 +326,7 @@ class MonetDBIndex(Index):
...
@@ -323,7 +326,7 @@ class MonetDBIndex(Index):
docs
.
append
({
docs
.
append
({
'docid'
:
docid
,
'docid'
:
docid
,
'name'
:
document
[
'name'
],
'name'
:
document
[
'name'
],
'length'
:
len
(
doc_terms
),
'length'
:
sum
(
doc_terms
.
values
()
),
})
})
amount_of_digits
=
math
.
floor
(
math
.
log10
(
len
(
data
)))
+
1
amount_of_digits
=
math
.
floor
(
math
.
log10
(
len
(
data
)))
+
1
...
@@ -334,17 +337,15 @@ class MonetDBIndex(Index):
...
@@ -334,17 +337,15 @@ class MonetDBIndex(Index):
dict_table
[
'termid'
]
=
dict_table
.
index
dict_table
[
'termid'
]
=
dict_table
.
index
dict_table
=
dict_table
[[
'termid'
,
'term'
,
'df'
]]
dict_table
=
dict_table
[[
'termid'
,
'term'
,
'df'
]]
dict_table
.
to_csv
(
'dict.csv'
,
header
=
False
,
index
=
False
)
dict_table
.
to_csv
(
'
.monetdb/
dict.csv'
,
header
=
False
,
index
=
False
)
doc_table
.
to_csv
(
'docs.csv'
,
header
=
False
,
index
=
False
)
doc_table
.
to_csv
(
'
.monetdb/
docs.csv'
,
header
=
False
,
index
=
False
)
term_table
.
to_csv
(
'terms.csv'
,
header
=
False
,
index
=
False
)
term_table
.
to_csv
(
'
.monetdb/
terms.csv'
,
header
=
False
,
index
=
False
)
for
table
in
(
'dict'
,
'docs'
,
'terms'
):
for
table
in
(
'dict'
,
'docs'
,
'terms'
):
filename
=
os
.
path
.
abspath
(
f
'
{
table
}
.csv'
)
self
.
cursor
.
execute
(
f
'DELETE FROM
{
table
}
'
)
self
.
cursor
.
execute
(
f
'DELETE FROM
{
table
}
'
)
self
.
cursor
.
execute
(
f
"COPY INTO
{
table
}
FROM '
{
filename
}
'"
)
self
.
cursor
.
execute
(
f
"COPY INTO
{
table
}
FROM '
/app/
{
table
}
.csv' USING DELIMITERS ',
'"
)
os
.
remove
(
f
'
{
table
}
.csv'
)
os
.
remove
(
f
'
.monetdb/
{
table
}
.csv'
)
self
.
cursor
.
execute
(
'COMMIT'
)
self
.
cursor
.
execute
(
'COMMIT'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment