Verified Commit 8fc209cc authored by Camil Staps's avatar Camil Staps 🚀

Word boundary anchors

parent 1fd508ea
......@@ -23,10 +23,10 @@ The following regex features are supported:
* Repetition: `?`, `+`, `*`, `{n}`, `{n,}`, `{m,n}` (all with lazy variants)
* Character classes: `[..]` or `[^..]` where `..` contains characters,
character ranges and shorthand character classes
* Anchors: `^` and `$`
* Anchors: `^`, `$`, `\b`, `\B`
* Shorthand character classes: `.`, `\d`, `\D`, `\w`, `\W`, `\s`, `\S`
* Escape sequences: `\a`, `\b`, `\f`, `\n`, `\r`, `\t`, `\v`, `\\`, `\nnn`
(octal) and `\xhh` (hexadecimal)
* Escape sequences: `\a`, `\f`, `\n`, `\r`, `\t`, `\v`, `\\`, `\nnn`
(octal) and `\xhh` (hexadecimal)
## Example (`test.icl`)
......@@ -101,11 +101,11 @@ The following BNF grammar is recognised by `compile`:
## Todo (in order of importance)
* More anchors (`\b`, `\B`, `\A`, `\Z`)
* Mode modifiers (`i`, `s`, `m`)
* Capturing groups
* Backreferences
* More functions for a convenient interface
* More anchors (`\A`, `\Z`)
[camilstaps]: https://camilstaps.nl
[Clean]: http://clean.cs.ru.nl
......@@ -14,6 +14,7 @@ import Regex.Print
| Repeated Greediness Int (Maybe Int) Regex
| StartOfString
| EndOfString
| WordBreak Bool
// Shorthand for a repetition with greediness `g`, a minimum count of 0 and
// `Nothing` as the maximum (presumably "no upper bound"); the parser uses it
// for the `*` family of operators (e.g. `*?` becomes `Many False`).
Many g :== Repeated g 0 Nothing
// Same as above but with a minimum count of 1; the parser uses it for `+`
// (`Some True`) and `+?` (`Some False`).
Some g :== Repeated g 1 Nothing
......
......@@ -52,6 +52,19 @@ match` StartOfString st
= if (isEmpty st.matched) [{st & can_skip=False}] []
// `$`: succeeds, leaving the state untouched, only when no unseen input
// remains; otherwise there is no way to match.
match` EndOfString st
	| isEmpty st.unseen = [st]
	| otherwise         = []
// Word boundary anchors. The parser produces `WordBreak False` for `\b` and
// `WordBreak True` for `\B`, so the flag says whether the boundary test is
// negated. The anchor consumes no input; on success the state is passed on
// with `can_skip` cleared.
match` r=:(WordBreak negated) st
	= matchAndContinue r st candidates
where
	// `negated <> atBoundary` is an exclusive or: for `\b` we need a
	// boundary, for `\B` we need the absence of one.
	candidates
		| negated <> atBoundary = [{st & can_skip=False}]
		| otherwise             = []

	// A boundary exists when exactly one of the neighbouring characters is a
	// word character.
	atBoundary = prevIsWord <> nextIsWord

	// The character just before the current position: the last matched
	// character if there is one, else the last skipped one. With neither,
	// there is no previous character and it counts as a non-word character.
	prevIsWord
		| not (isEmpty st.matched) = isWordChar (last st.matched)
		| not (isEmpty st.skipped) = isWordChar (last st.skipped)
		| otherwise                = False

	// The character at the current position, if any input is left.
	nextIsWord
		| isEmpty st.unseen = False
		| otherwise         = isWordChar (hd st.unseen)

	// ASCII letters, digits and the underscore.
	isWordChar ch
		=  ch == '_'
		|| ('0' <= ch && ch <= '9')
		|| ('a' <= ch && ch <= 'z')
		|| ('A' <= ch && ch <= 'Z')
skip :: Int MatchStatus -> MatchStatus
skip n st
......
......@@ -31,8 +31,6 @@ where
parse :: [Regex] [Char] -> Maybe ([Regex], [Char])
parse rs [] = Just (rs, [])
parse rs ['^':cs] = Just ([StartOfString:rs], cs)
parse rs ['$':cs] = Just ([EndOfString:rs], cs)
parse [r:rs] ['+':'?':cs] = Just ([Some False r:rs], cs)
parse [r:rs] ['+':cs] = Just ([Some True r:rs], cs)
parse [r:rs] ['*':'?':cs] = Just ([Many False r:rs], cs)
......@@ -87,8 +85,16 @@ where
parse rs cs
= (\(c,cs) -> ([CharacterClass False c:rs],cs)) <$> shorthandClass cs
<|> (\(a,cs) -> ([a:rs], cs)) <$> anchor cs
<|> (\(c,cs) -> ([Literal [c]:rs], cs)) <$> singleChar cs
// Recognise an anchor at the front of the input. On success the anchor is
// returned together with the remaining input; Nothing is returned when the
// input does not start with `^`, `$`, `\b` or `\B`.
anchor :: [Char] -> Maybe (Regex, [Char])
anchor input = case input of
	['^':rest]      -> Just (StartOfString, rest)
	['$':rest]      -> Just (EndOfString, rest)
	['\\':'b':rest] -> Just (WordBreak False, rest)
	['\\':'B':rest] -> Just (WordBreak True, rest)
	_               -> Nothing
singleChar :: [Char] -> Maybe (Char, [Char])
singleChar ['\\':'x':c1:c2:cs]
= Just (toChar $ 16 * fromHex c1 + fromHex c2, cs)
......
......@@ -2,7 +2,6 @@ definition module Regex.Util
escape_sequences :==
[ ('a', '\x07')
, ('b', '\x08')
, ('f', '\x0c')
, ('n', '\x0a')
, ('r', '\x0d')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment