Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
Gijs Hendriksen
Information Retrieval
Commits
f8f1c0a3
Commit
f8f1c0a3
authored
Nov 13, 2019
by
Gijs Hendriksen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Complete indexing step, add wrapper for index operations
parent
402b2c93
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
96 additions
and
31 deletions
+96
-31
.gitignore
.gitignore
+1
-0
index.py
index.py
+57
-31
main.py
main.py
+38
-0
No files found.
.gitignore
View file @
f8f1c0a3
.idea/
.idea/
__pycache__/
*.db
*.db
*.db.wal
*.db.wal
index.py
View file @
f8f1c0a3
import
duckdb
import
duckdb
import
os
import
os
from
collections
import
defaultdict
class Index:
    """Inverted index over text documents, stored in a DuckDB database.

    Three tables back the index:

    * ``dict``  -- one row per distinct term: (termid, term, df)
    * ``docs``  -- one row per indexed document: (docid, name, length)
    * ``terms`` -- one row per (term, document) pair: (termid, docid, count)

    The sequences ``term_ids`` and ``doc_ids`` hand out the identifiers.
    """

    def __init__(self, db):
        """Open the database at path *db*, creating the schema if it is new.

        Args:
            db: Path of the DuckDB database file.
        """
        # Check for the file before connecting; connecting presumably creates
        # it, which would make the check always succeed -- TODO confirm.
        db_exists = os.path.exists(db)

        self.cursor = duckdb.connect(db).cursor()
        self.stopwords = ['i', 'on', 'my', 'and', 'in', 'the']

        if db_exists:
            self.reset_auto_increment()
        else:
            self.init_db()

    def reset_auto_increment(self):
        """Recreate both id sequences so they continue after the stored ids.

        Each sequence restarts at one past the highest id found in the
        tables, or at 1 when the table is empty.
        """
        max_termid = self.cursor.execute('SELECT MAX(termid) FROM terms').fetchone()[0]
        next_termid = 1 if max_termid is None else int(max_termid) + 1

        max_docid = self.cursor.execute('SELECT MAX(docid) FROM docs').fetchone()[0]
        next_docid = 1 if max_docid is None else int(max_docid) + 1

        self.cursor.execute('DROP SEQUENCE term_ids')
        self.cursor.execute('DROP SEQUENCE doc_ids')
        # The start values are integers computed above, never user input, so
        # formatting them into the DDL statement is safe.
        self.cursor.execute(f'CREATE SEQUENCE term_ids START WITH {next_termid}')
        self.cursor.execute(f'CREATE SEQUENCE doc_ids START WITH {next_docid}')

    def init_db(self):
        """Create the id sequences and the ``dict``, ``docs`` and ``terms`` tables."""
        self.cursor.execute('CREATE SEQUENCE term_ids')
        self.cursor.execute('CREATE SEQUENCE doc_ids')
        self.cursor.execute('CREATE TABLE dict('
                            'termid INTEGER NOT NULL,'
                            'term VARCHAR NOT NULL,'
                            'df INTEGER NOT NULL)')
        self.cursor.execute('CREATE TABLE docs('
                            'docid INTEGER NOT NULL,'
                            'name VARCHAR NOT NULL,'
                            'length INTEGER NOT NULL)')
        self.cursor.execute('CREATE TABLE terms('
                            'termid INTEGER NOT NULL,'
                            'docid INTEGER NOT NULL,'
                            'count INTEGER NOT NULL)')

    def get_terms(self, body):
        """Count the non-stopword terms of *body*.

        The text is lower-cased and split on whitespace.

        Args:
            body: The document text.

        Returns:
            A ``defaultdict(int)`` mapping each term to its number of
            occurrences in *body*.
        """
        # Build the set once so each membership test is O(1).
        stopwords = set(self.stopwords)
        terms = defaultdict(int)
        for term in body.lower().split():
            if term not in stopwords:
                terms[term] += 1
        return terms

    def index(self, document):
        """Add one document to the index.

        Args:
            document: Mapping with the keys ``'name'`` (document name) and
                ``'body'`` (document text).
        """
        terms = self.get_terms(document['body'])
        doc_name = document['name']
        # Length is the number of indexed tokens; len(terms) would only count
        # the distinct terms and under-report documents with repeated words.
        doc_length = sum(terms.values())

        doc_id = self.cursor.execute("SELECT nextval('doc_ids')").fetchone()[0]
        # Values are passed as parameters rather than formatted into the SQL,
        # so names and terms containing quotes cannot break or alter a query.
        self.cursor.execute("INSERT INTO docs VALUES (?, ?, ?)",
                            [doc_id, doc_name, doc_length])

        for term, frequency in terms.items():
            row = self.cursor.execute("SELECT termid FROM dict WHERE term = ?",
                                      [term]).fetchone()
            if row is None:
                # First occurrence of this term in any document.
                term_id = self.cursor.execute("SELECT nextval('term_ids')").fetchone()[0]
                self.cursor.execute("INSERT INTO dict VALUES (?, ?, 1)",
                                    [term_id, term])
            else:
                term_id = row[0]
                self.cursor.execute("UPDATE dict SET df = df + 1 WHERE termid = ?",
                                    [term_id])

            self.cursor.execute("INSERT INTO terms VALUES (?, ?, ?)",
                                [term_id, doc_id, frequency])

    def bulk_index(self, filename):
        """Index all documents stored in *filename* (not implemented yet)."""
        # TODO read data from filename and index documents
        pass

    def print_index(self):
        """Print every row of the ``dict``, ``terms`` and ``docs`` tables.

        Each table is introduced by a line holding its name; the columns of a
        row are separated by tabs.
        """
        # Table names are fixed constants, so formatting them in is safe.
        for table in ('dict', 'terms', 'docs'):
            print(table)
            rows = self.cursor.execute(f'SELECT * FROM {table}').fetchall()
            for row in rows:
                print('\t'.join(map(str, row)))
main.py
0 → 100644
View file @
f8f1c0a3
from
argparse
import
ArgumentParser
from
index
import
Index
def bulk_index(index, args):
    """Handle the ``index`` sub-command.

    Args:
        index: Object exposing ``bulk_index(filename)``.
        args: Parsed command-line arguments; ``args.data`` names the file to
            read documents from.
    """
    source_file = args.data
    index.bulk_index(source_file)
def query_index(index, args):
    """Handle the ``query`` sub-command (querying is not implemented yet).

    Args:
        index: The index to query; currently unused.
        args: Parsed command-line arguments; ``args.terms`` holds the query
            terms.
    """
    terms = args.terms
    # TODO use query terms to query index
def main():
    """Parse the command line and run the selected sub-command.

    Two sub-commands are available:

    * ``index <database> <data>``     -- handled by ``bulk_index``
    * ``query <database> [terms...]`` -- handled by ``query_index``

    The handler is called with an ``Index`` opened on ``<database>`` and the
    parsed arguments. Exits with a usage error when no sub-command is given.
    """
    parser = ArgumentParser(
        prog='old_duck',
        description='OldDuck - A Python implementation of OldDog, using DuckDB',
    )
    subparsers = parser.add_subparsers()

    parser_index = subparsers.add_parser('index')
    parser_index.add_argument('database', help='The database file to index the files to')
    parser_index.add_argument('data', help='The file to read and index documents from')
    # Register the handler on the sub-parser itself. Setting it on the
    # top-level parser makes the last registration win for every sub-command.
    parser_index.set_defaults(func=bulk_index)

    parser_query = subparsers.add_parser('query')
    parser_query.add_argument('database', help='The database file to query')
    parser_query.add_argument('terms', help='The query terms', nargs='*')
    parser_query.set_defaults(func=query_index)

    args = parser.parse_args()
    # Without a sub-command neither ``func`` nor ``database`` is set; report a
    # usage error instead of failing with an AttributeError below.
    if not hasattr(args, 'func'):
        parser.error('a sub-command is required: index or query')

    index = Index(args.database)
    args.func(index, args)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment