Verified Commit 798d7902 authored by Camil Staps's avatar Camil Staps 🚀

Add Data.NGramIndex

parent 52347372
Pipeline #11237 passed with stage
in 1 minute and 41 seconds
definition module Data.NGramIndex
/**
* *n*-grams are sequences of *n* characters. The *n*-gram index maps *n*-grams
* to sets of values. This is for instance useful in search engines.
*/
from StdClass import class Eq, class ==, class Ord, class <
from Data.Map import :: Map
from Data.Maybe import :: Maybe
/**
 * The *n*-gram index maps character *n*-grams to sets of values.
 * The sets are represented as lists; {{`index`}} never stores the same value
 * twice under one gram.
 */
:: NGramIndex v =
{ n :: !Int //* The parameter *n* for the size of the grams
, ci :: !Bool //* Whether matching is case-insensitive
, idx :: !Map [Char] [v] //* Maps each gram to the values indexed under it
}
/**
 * Create a new {{`NGramIndex`}}.
 * @param The parameter *n*
 * @param Whether the index should be case insensitive
 * @result An index with the given settings that holds no grams yet
 */
newNGramIndex :: !Int !Bool -> NGramIndex v
/**
 * Get the size (the number of grams, not the number of values) of an
 * {{`NGramIndex`}}.
 * @param The index
 * @result The size of the map from grams to values
 */
ngramSize :: !(NGramIndex v) -> Int
/**
 * Add a certain value with a certain key to an index.
 * @param The key. The value is added under every *i*-gram of the key for
 *   1 <= *i* <= *n*, so that keys shorter than *n* can be found as well
 * @param The value. It is stored at most once per gram
 * @param The index to extend
 * @result The extended index
 */
index :: !String !v !(NGramIndex v) -> NGramIndex v | Eq v
/**
 * Search for a key in the index.
 * @param The key. For all *n*-grams of the key, the values will be returned.
 *   A key of fewer than *n* characters is looked up as one single gram
 * @param The index to search in
 * @result For each matching value, a tuple of the value and the number of
 *   matching *n*-grams is returned
 */
search :: !String !(NGramIndex v) -> [(v,Int)] | Eq, Ord v
/**
 * Get the *n*-grams of a string.
 * @param Whether this should be done case insensitively (if so, the string
 *   is converted to lowercase first)
 * @param The parameter *n*
 * @param The string
 * @result The grams of exactly *n* characters, without duplicates. A string
 *   of fewer than *n* characters has no such grams
 */
ngrams :: !Bool !Int !String -> [[Char]]
implementation module Data.NGramIndex
import _SystemArray
import StdChar
from StdFunc import flip, o
import StdInt
from StdList import filter, flatten, isMember, map, removeDup, span, take, ++,
instance length [], instance == [a], instance < [a], instance fromString [Char]
import StdOrdList
from Data.Func import $
from Data.List import concatMap, tails
import Data.Map
import Data.Maybe
import Data.Monoid
// Builds an index with the given gram size and case setting that does not
// hold any grams yet.
newNGramIndex :: !Int !Bool -> NGramIndex v
newNGramIndex gramSize insensitive =
	{ n   = gramSize
	, ci  = insensitive
	, idx = newMap
	}
// The size of the index is the size of its gram map; the values stored under
// the grams are not counted.
ngramSize :: !(NGramIndex v) -> Int
ngramSize {idx} = mapSize idx
// Registers the value under every i-gram of the key with 1 <= i <= n. The
// shorter grams are what make keys of fewer than n characters findable.
index :: !String !v !(NGramIndex v) -> NGramIndex v | Eq v
index s v ngi=:{n,ci,idx} = {ngi & idx = foldr register idx grams}
where
	// All grams of the key, grouped by gram length from 1 up to n.
	grams = flatten [ngrams ci len s \\ len <- [1..n]]

	// Adds v to the values of a single gram.
	register gram = alter (Just o extend) gram

	// A gram that is new to the index starts with just v; otherwise v is
	// prepended unless it is already present.
	extend Nothing = [v]
	extend (Just present) = if (isMember v present) present [v:present]
// Looks up every n-gram of the key and reports, for each value found, under
// how many of those grams it is stored. The result is ordered ascending by
// value.
//
// A key with fewer than n characters has no n-grams; it is looked up as one
// gram instead. It is lowercased only when the index is case-insensitive, so
// that it is normalised in the same way as the grams stored by `index`.
//
// The value lists are kept in insertion order by `index`, whereas `merge` and
// `count` need ordered input: without sorting, one value could end up in
// several tuples with partial counts. Hence every list is sorted before the
// lists are merged. Sorting cannot be done in `index`, which only has `Eq v`.
search :: !String !(NGramIndex v) -> [(v,Int)] | Eq, Ord v
search s {n,ci,idx} = count
	$ foldr merge []
	$ map (sort o fromMaybe [] o flip get idx)
	$ if (size s >= n) (ngrams ci n s) [if ci (map toLower) id $ fromString s]
where
	// Collapses each run of equal neighbours into the element and the length
	// of the run. On an ordered list this is the number of occurrences.
	count :: [v] -> [(v,Int)] | == v
	count [] = []
	count [x:xs] = [(x,length yes + 1):count no]
	where
		(yes,no) = span ((==) x) xs
// Yields the distinct grams of exactly n characters of the string, after
// lowercasing the string when ci is set.
ngrams :: !Bool !Int !String -> [[Char]]
ngrams ci n s = removeDup [gram \\ gram <- candidates | length gram == n]
where
	// The first n characters of every suffix. Suffixes near the end of the
	// string are too short; these are dropped by the length check above.
	candidates = map (take n) (tails (normalise (fromString s)))

	normalise cs = if ci (map toLower cs) cs
......@@ -83,6 +83,7 @@ import qualified Data.MapCollection
import qualified Data.Matrix
import qualified Data.Maybe
import qualified Data.Monoid
import qualified Data.NGramIndex
import qualified Data.OrdList
import qualified Data.Queue
import qualified Data.Set
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment