Commit 5f998314 authored by Camil Staps's avatar Camil Staps 🐧

Unnamed capture groups

parent 8e44ddf6
......@@ -74,6 +74,7 @@ The following BNF grammar is recognised by `compile`:
| <Regex> <Quantifier> ['?']
| <Regex> <Regex>
| <Regex> '|' <Regex>
| '(' <Regex> ')'
| '(?:' <Regex> ')'
| <Literal>
......@@ -110,9 +111,9 @@ The following BNF grammar is recognised by `compile`:
## Todo (in order of importance)
* Mode modifiers (`i`, `s`, `m`)
* Capturing groups
* Backreferences
* More functions for a convenient interface
* Named capture groups
* Backreferences
* More anchors (`\A`, `\Z`)
[camilstaps]: https://camilstaps.nl
......
......@@ -12,6 +12,7 @@ import Regex.Print
| Concat [Regex]
| Any [Regex]
| Repeated Greediness Int (Maybe Int) Regex
| Group GroupId Regex
| StartOfString
| EndOfString
| WordBreak Bool
......@@ -22,3 +23,5 @@ Optional g :== Repeated g 0 (Just 1)
:: Greediness :== Bool
:: Position :== Int
:: GroupId = Named String | NotNamed Int
definition module Regex.Match
from Regex import :: Regex, :: Position
from Regex import :: Regex, :: Position, :: GroupId
match :: !Regex [Char] -> [(Position, [Char])]
:: Match :== (Position, [Char], [(GroupId, [Char])])
match :: !Regex [Char] -> [Match]
......@@ -15,14 +15,15 @@ import Regex
, matched :: [Char]
, unseen :: [Char]
, can_skip :: Bool
, groups :: [(GroupId, [Char])]
}
instance zero MatchStatus
where zero = {skipped=[], matched=[], unseen=[], can_skip=True}
where zero = {skipped=[], matched=[], unseen=[], can_skip=True, groups=[]}
match :: !Regex [Char] -> [(Position, [Char])]
match r s = [(length st.skipped, st.matched) \\ st <- match` r status]
where status = {zero & unseen=s}
match :: !Regex [Char] -> [Match]
match r s = [(length st.skipped, st.matched, st.groups) \\ st <- match` r stat]
where stat = {zero & unseen=s}
match` :: Regex MatchStatus -> [MatchStatus]
match` r=:(Literal cs) st
......@@ -48,6 +49,9 @@ match` (Repeated g f t r) st
= match` (Concat $ repeatn f r) st
= match` (Concat $ repeatn f r ++ [Any $ if g id reverse opts]) st
where opts = [Concat [r, Repeated g 0 (flip (-) (f+1) <$> t) r], Concat []]
match` (Group id r) st
= [{st` & groups=st`.groups ++ [(id,drop (length st.matched) st`.matched)]}
\\ st` <- match` r st]
match` StartOfString st
= if (isEmpty st.matched) [{st & can_skip=False}] []
match` EndOfString st
......
......@@ -17,42 +17,45 @@ import Regex
import Regex.Util
compile :: [Char] -> Maybe Regex
compile cs = compile` [] cs
compile cs = compile` 0 [] cs
where
compile` :: [Regex] [Char] -> Maybe Regex
compile` [] [] = Just $ Concat []
compile` [r] [] = Just r
compile` rs [] = Just $ Concat $ reverse rs
compile` :: Int [Regex] [Char] -> Maybe Regex
compile` _ [] [] = Just $ Concat []
compile` _ [r] [] = Just r
compile` _ rs [] = Just $ Concat $ reverse rs
where
itlt :: [a] -> ([a], a)
itlt [x] = ([], x)
itlt [x:xs] = let (it,lt) = itlt xs in ([x:it], lt)
compile` rs cs = parse rs cs >>= uncurry compile`
parse :: [Regex] [Char] -> Maybe ([Regex], [Char])
parse rs [] = Just (rs, [])
parse [r:rs] ['+':'?':cs] = Just ([Some False r:rs], cs)
parse [r:rs] ['+':cs] = Just ([Some True r:rs], cs)
parse [r:rs] ['*':'?':cs] = Just ([Many False r:rs], cs)
parse [r:rs] ['*':cs] = Just ([Many True r:rs], cs)
parse [r:rs] ['?':'?':cs] = Just ([Optional False r:rs], cs)
parse [r:rs] ['?':cs] = Just ([Optional True r:rs], cs)
parse [Any rs:rs`] ['|':cs] = (\(r,cs)->([Any $ rs++r:rs`],cs)) <$> parse [] cs
parse [r:rs] ['|':cs] = (\(r`,cs)->([Any [r:r`]:rs],cs)) <$> parse [] cs
parse _ ['|':_] = Nothing
parse [r:rs] ['{':cs]
compile` g rs cs = parse g rs cs >>= uncurry3 compile`
parse :: Int [Regex] [Char] -> Maybe (Int, [Regex], [Char])
parse g rs [] = Just (g, rs, [])
parse g [r:rs] ['+':'?':cs] = Just (g, [Some False r:rs], cs)
parse g [r:rs] ['+':cs] = Just (g, [Some True r:rs], cs)
parse g [r:rs] ['*':'?':cs] = Just (g, [Many False r:rs], cs)
parse g [r:rs] ['*':cs] = Just (g, [Many True r:rs], cs)
parse g [r:rs] ['?':'?':cs] = Just (g, [Optional False r:rs], cs)
parse g [r:rs] ['?':cs] = Just (g, [Optional True r:rs], cs)
parse g [Any rs:rs`] ['|':cs]
= (\(g,r,cs) -> (g,[Any $ rs++r:rs`],cs)) <$> parse g [] cs
parse g [r:rs] ['|':cs]
= (\(g,r`,cs) -> (g,[Any [r:r`]:rs],cs)) <$> parse g [] cs
parse g _ ['|':_]
= Nothing
parse g [r:rs] ['{':cs]
| isNothing fr = Nothing
| hd cs` == '}'
| length cs` > 1 && cs`!!1 == '?'
= Just ([Repeated False fr` (Just fr`) r:rs], drop 2 cs`)
= Just ([Repeated True fr` (Just fr`) r:rs], tl cs`)
= Just (g, [Repeated False fr` (Just fr`) r:rs], drop 2 cs`)
= Just (g, [Repeated True fr` (Just fr`) r:rs], tl cs`)
| isNothing to = Nothing
| hd cs` == ',' && hd cs`` == '}'
| length cs`` > 1 && cs``!!1 == '?'
= Just ([Repeated False fr` to` r:rs], drop 2 cs``)
= Just ([Repeated True fr` to` r:rs], tl cs``)
= Just (g, [Repeated False fr` to` r:rs], drop 2 cs``)
= Just (g, [Repeated True fr` to` r:rs], tl cs``)
| otherwise = Nothing
where
(fr`, to`) = (fromJust fr, fromJust to)
......@@ -67,26 +70,29 @@ where
toEnd [] = Just Nothing
toEnd cs = Just <$> parseInt cs
parse rs ['[':'^':cs]
= appFst (\cc -> [CharacterClass True cc:rs]) <$> charClass [] cs
parse rs ['[':cs]
= appFst (\cc -> [CharacterClass False cc:rs]) <$> charClass [] cs
parse g rs ['[':'^':cs]
= (\(cc,cs) -> (g, [CharacterClass True cc:rs], cs)) <$> charClass [] cs
parse g rs ['[':cs]
= (\(cc,cs) -> (g, [CharacterClass False cc:rs], cs)) <$> charClass [] cs
parse rs ['(':'?':':':cs] = case parse` [] cs of
parse g rs ['(':'?':':':cs] = case inGroup g [] cs of
Nothing = Nothing
(Just ([], cs)) = Just (rs, cs)
(Just ([r], cs)) = Just ([r:rs], cs)
(Just (rs`, cs)) = Just ([Concat $ reverse rs`:rs], cs)
where
parse` :: [Regex] [Char] -> Maybe ([Regex], [Char])
parse` grp [] = Nothing
parse` grp [')':cs] = Just (grp, cs)
parse` grp cs = parse grp cs >>= uncurry parse`
(Just (g, [], cs)) = Just (g, rs, cs)
(Just (g, [r], cs)) = Just (g, [r:rs], cs)
(Just (g, rs`, cs)) = Just (g, [Concat $ reverse rs`:rs], cs)
parse g rs ['(':cs] = (\(g`,rs`,cs) -> (g`, [Group (NotNamed g) (cc rs`):rs], cs)) <$> inGroup (g+1) [] cs
where cc [r] = r; cc rs = Concat $ reverse rs
parse rs cs
= (\(c,cs) -> ([CharacterClass False c:rs],cs)) <$> shorthandClass cs
<|> (\(a,cs) -> ([a:rs], cs)) <$> anchor cs
<|> (\(c,cs) -> ([Literal [c]:rs], cs)) <$> singleChar cs
parse g rs cs
= (\(c,cs) -> (g, [CharacterClass False c:rs],cs)) <$> shorthandClass cs
<|> (\(a,cs) -> (g, [a:rs], cs)) <$> anchor cs
<|> (\(c,cs) -> (g, [Literal [c]:rs], cs)) <$> singleChar cs
inGroup :: Int [Regex] [Char] -> Maybe (Int, [Regex], [Char])
inGroup g grp [] = Nothing
inGroup g grp [')':cs] = Just (g, grp, cs)
inGroup g grp cs = parse g grp cs >>= uncurry3 inGroup
anchor :: [Char] -> Maybe (Regex, [Char])
anchor ['^':cs] = Just (StartOfString, cs)
......@@ -136,3 +142,6 @@ shorthandClass :: [Char] -> Maybe ([(Char,Char)], [Char])
shorthandClass ['\\':c:cs] = (\cc -> (cc,cs)) <$> lookup c shorthand_classes
shorthandClass ['.':cs] = Just ([('\x00','\x09'),('\x0b','\xff')], cs)
shorthandClass _ = Nothing
uncurry3 :: (a b c -> d) (a,b,c) -> d
uncurry3 f (x,y,z) = f x y z
module example
import StdEnv
from Data.Func import $
import Data.Functor
import Data.Maybe
import Data.Tuple
import Regex
Start = map (appSnd toString) <$> flip match input <$> compile rgx
Start = map (appSnd3 toString o (appThd3 $ map (appSnd toString)))
<$> flip match input <$> compile rgx
where
rgx = ['[a-zA-Z\\d][\\w\\.%+-]*@(?:[a-zA-Z\\d-]+\\.)+[a-zA-Z]{2,}']
rgx = ['^([a-zA-Z\\d][\\w\\.%+-]*)@([a-zA-Z\\d-]+\\.)+[a-zA-Z]{2,}']
input = ['info@camilstaps.nl']
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment