Commit c49f1a9c authored by Camil Staps's avatar Camil Staps 🐧

Negated & shorthand character classes; escaping

parent 89d50359
......@@ -4,7 +4,7 @@ CLMFLAGS:=\
-I $$CLEAN_HOME/lib/Generics\
-nt
DEPMODS:=Regex Regex/Match Regex/Parse Regex/Print
DEPMODS:=Regex Regex/Match Regex/Parse Regex/Print Regex/Util
DEPS:=$(addsuffix .dcl,$(DEPMODS)) $(addsuffix .icl,$(DEPMODS))
EXE:=test
......
......@@ -8,7 +8,7 @@ import Regex.Print
:: Regex
= Literal [Char]
| CharacterClass [(Char,Char)]
| CharacterClass Bool [(Char,Char)]
| Concat [Regex]
| Any [Regex]
| Repeated Greediness Int (Maybe Int) Regex
......
......@@ -28,11 +28,11 @@ match` :: Regex MatchStatus -> [MatchStatus]
match` r=:(Literal cs) st
= matchAndContinue r st $
if (isPrefixOf cs st.unseen) [eat (length cs) st] []
match` (CharacterClass _) {unseen=[]}
match` (CharacterClass _ _) {unseen=[]}
= []
match` r=:(CharacterClass cs) st=:{matched,unseen=[u:us]}
match` r=:(CharacterClass n cs) st=:{matched,unseen=[u:us]}
= matchAndContinue r st $
if (any (\(f,t) -> f <= u && u <= t) cs) [eat 1 st] []
if (if n not id $ any (\(f,t) -> f <= u && u <= t) cs) [eat 1 st] []
match` (Concat []) st
= [st]
match` (Concat [r]) st
......
......@@ -9,10 +9,12 @@ import Control.Applicative
import Control.Monad
from Data.Func import $
import Data.Functor
import Data.List
import Data.Maybe
import Data.Tuple
import Regex
import Regex.Util
compile :: [Char] -> Maybe Regex
compile cs = compile` [] cs
......@@ -63,14 +65,10 @@ where
toEnd [] = Just Nothing
toEnd cs = Just <$> parseInt cs
parse rs ['[':cs] = appFst (\cc -> [CharacterClass cc:rs]) <$> charClass [] cs
where
charClass :: [(Char,Char)] [Char] -> Maybe ([(Char,Char)], [Char])
charClass _ [] = Nothing
charClass cls [']':cs] = Just (cls,cs)
charClass cls [c:'-':']':cs] = Just (cls ++ [(c,c),('-','-')], cs)
charClass cls [c:'-':c2:cs] = charClass (cls ++ [(c,c2)]) cs
charClass cls [c:cs] = charClass (cls ++ [(c,c)]) cs
parse rs ['[':'^':cs]
= appFst (\cc -> [CharacterClass True cc:rs]) <$> charClass [] cs
parse rs ['[':cs]
= appFst (\cc -> [CharacterClass False cc:rs]) <$> charClass [] cs
parse rs ['(':'?':':':cs] = case parse` [] cs of
Nothing = Nothing
......@@ -82,6 +80,48 @@ where
parse` grp [] = Nothing
parse` grp [')':cs] = Just (grp, cs)
parse` grp cs = parse grp cs >>= uncurry parse`
parse rs [c:cs]
= Just ([Literal [c]:rs], cs)
parse _ _ = Nothing
parse rs cs
= (\(c,cs) -> ([CharacterClass False c:rs],cs)) <$> shorthandClass cs
<|> (\(c,cs) -> ([Literal [c]:rs], cs)) <$> singleChar cs
singleChar :: [Char] -> Maybe (Char, [Char])
singleChar ['\\':'x':c1:c2:cs]
= Just (toChar $ 16 * fromHex c1 + fromHex c2, cs)
where fromHex c = length $ takeWhile ((<>) c) ['0123456789abcdef']
singleChar ['\\':c:cs]
| isMember c escapable
= Just (c, cs)
| isMember c $ map fst escape_sequences
= Just (snd $ hd $ filter ((==) c o fst) escape_sequences, cs)
| isOctDigit c
= Just (toChar $ foldl ((+) o ((*) 8)) 0 $ map digitToInt octs, nonocts)
| otherwise
= Nothing
where
escapable = ['\\+*?()[]{}^$.']
(octs,nonocts) = spanMax 3 isOctDigit [c:cs]
spanMax :: Int (a -> Bool) [a] -> ([a], [a])
spanMax 0 _ xs = ([], xs)
spanMax _ _ [] = ([], [])
spanMax n f [x:xs] = if (f x) ([x:ys],zs) ([],[x:xs])
where (ys,zs) = spanMax (n-1) f xs
singleChar [c:cs]
= Just (c,cs)
charClass :: [(Char,Char)] [Char] -> Maybe ([(Char,Char)], [Char])
charClass _ [] = Nothing
charClass cls [']':cs] = Just (cls,cs)
charClass cls cs = (try_chars <|> shorthandClass cs) >>=
\(cls`,cs`) -> charClass (cls ++ cls`) cs`
where
try_chars = singleChar cs >>= \(c1,cs`) -> case cs` of
['-':cs``] = (\(c2,cs) -> ([(c1,c2)],cs)) <$> singleChar cs``
_ = Just ([(c1,c1)], cs`)
shorthandClass :: [Char] -> Maybe ([(Char,Char)], [Char])
shorthandClass ['\\':c:cs] = (\cc -> (cc,cs)) <$> lookup c shorthand_classes
shorthandClass ['.':cs] = Just ([('\0','\255')], cs)
shorthandClass _ = Nothing
......@@ -2,17 +2,26 @@ implementation module Regex.Print
import _SystemArray
import StdBool
from StdFunc import flip
import StdString
import StdTuple
from Data.Func import $
import Data.List
import Data.Maybe
import Regex
import Regex.Util
print :: Bool Regex -> String
print ps (Literal cs) = parens (ps && length cs > 1) cs
print ps (CharacterClass cs) = "[" <+ foldl (<++) "" cs <+ "]"
print ps (Literal cs) = parens (ps && length cs > 1) $ foldl (+++) "" $ map escape cs
print ps (CharacterClass True cc) = foldl (+++) "[^" (printCharClasses cc) +++ "]"
print ps (CharacterClass False cc)
| length ccs == 1 = hd ccs
| otherwise = foldl (+++) "[" ccs +++ "]"
where ccs = printCharClasses cc
print ps (CharacterClass False [('\0','\255')]) = "."
print ps (CharacterClass n cs) = "[" <+ if n "^" "" <+ foldl (<++) "" cs <+ "]"
where
(<++) infixr 5 :: String (Char,Char) -> String
(<++) s (c1,c2)
......@@ -22,11 +31,11 @@ print ps (Concat rgxs) = parens ps $ foldl (\x s -> x +++ s) r rs
where [r:rs] = map (print True) rgxs
print ps (Any rgxs) = parens ps $ foldl (\x s -> x +++ "|" +++ s) r rs
where [r:rs] = map (print True) rgxs
print ps (Repeated g 0 (Just 1) r) = parens ps $ print True r <+ "?" <+ lz g
print ps (Repeated g 1 (Just 1) r) = parens ps $ print True r
print ps (Repeated g 0 Nothing r) = parens ps $ print True r <+ "*" <+ lz g
print ps (Repeated g 1 Nothing r) = parens ps $ print True r <+ "+" <+ lz g
print ps (Repeated g f t r) = parens ps $ print True r <+ q <+ lz g
print ps (Repeated g 0 (Just 1) r) = print True r <+ "?" <+ lz g
print ps (Repeated g 1 (Just 1) r) = print True r
print ps (Repeated g 0 Nothing r) = print True r <+ "*" <+ lz g
print ps (Repeated g 1 Nothing r) = print True r <+ "+" <+ lz g
print ps (Repeated g f t r) = print True r <+ q <+ lz g
where
q
| isNothing t = "{" <+ f <+ ",}"
......@@ -36,7 +45,7 @@ where
print ps StartOfString = "^"
print ps EndOfString = "$"
lz :: Bool -> String // The lazy quantifier
lz :: Bool -> String
lz True = ""
lz False = "?"
......@@ -48,3 +57,33 @@ parens False s = toString s
(<+) infixr 5 :: a b -> String | toString a & toString b
(<+) x y = toString x +++ toString y
printCharClasses :: [(Char,Char)] -> [String]
printCharClasses [] = []
printCharClasses cs = case findShorthand shorthand_classes cs of
(Just (n,cs`)) = [n:printCharClasses cs`]
Nothing = map printRange cs
where
findShorthand :: [(Char, [(Char,Char)])] [(Char,Char)] -> Maybe (String, [(Char,Char)])
findShorthand [] _ = Nothing
findShorthand [(n,ndl):shts] hay
| all (flip isMember hay) ndl = Just ({'\\',n}, foldr removeMember hay ndl)
| otherwise = findShorthand shts hay
printRange :: (Char,Char) -> String
printRange ('\0','\255') = "."
printRange (c1,c2)
| c1 == c2 = c1`
| otherwise = c1` +++ "-" +++ c2`
where (c1`,c2`) = (escape c1, escape c2)
escape :: Char -> String
escape c
| isMember c $ map snd escape_sequences
= hd [{'\\',n} \\ (n,c`) <- escape_sequences | c == c`]
| c < ' ' || c > '~'
= {'\\', 'x', hex (toInt c / 16), hex (toInt c rem 16)}
| otherwise
= {c}
where
hex i = "0123456789abcdef".[i]
definition module Regex.Util
escape_sequences :==
[ ('a', '\a')
, ('b', '\b')
, ('f', '\f')
, ('n', '\n')
, ('r', '\r')
, ('t', '\t')
, ('v', '\v')
, ('\\', '\\')
]
shorthand_classes :==
[ ('d', [('0','9')])
, ('D', [('\0','/'),(':','\255')])
, ('w', [('A','Z'),('a','z'),('0','9'),('_','_')])
, ('W', [('\0','/'),(':','@'),('[','^'),('`','`'),('{','\255')])
, ('s', [(' ',' '),('\t','\n'),('\f','\r')])
, ('S', [('\0','\8'),('\11','\11'),('\14','\31'),('!','\255')])
]
implementation module Regex.Util
......@@ -6,5 +6,5 @@ import Regex
Start = (toString rgx, match rgx string)
where
(Just rgx) = compile ['[e-hl]{1,2}?l']
(Just rgx) = compile ['\\s.*[ld]']
string = ['hello world']
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment