Commit ac8d425f authored by Steffen Michels's avatar Steffen Michels

consistent use of offsets (pointing to end of token), unify attribute value &...

consistent use of offsets (pointing to end of token), unify attribute value & char data tokens and properly decode named entity references in attribute values
parent 77c63c4e
......@@ -154,8 +154,7 @@ where
= Ok (snd (hd xmlDoc))
//Token type which is the intermediary representation during XML parsing
:: Token = TokenAttrValue !String
| TokenCharData !String
:: Token = TokenCharData !String
| TokenName !String
| TokenStartTagOpen
| TokenTagClose
......@@ -176,18 +175,15 @@ isName _ = False
isCharData (TokenCharData _) = True
isCharData _ = False
isAttrValue (TokenAttrValue _) = True
isAttrValue _ = False
:: LexFunctionResult = Token !Int !Token | NoToken !Int | Fail !String
:: LexFunction :== String Int -> Maybe LexFunctionResult
lex :: !String !Int ![Token] -> MaybeErrorString [Token]
lex input offset tokens
| offset >= size input = Ok (reverse tokens) //Done
| dataMode tokens && isJust charDataResult = processResult (fromJust charDataResult)
| otherwise = processResult (lexAny input offset lexFunctions)
where
lexFunctions = [ lexWhitespace
, lexDeclarationStart
......@@ -208,11 +204,11 @@ where
dataMode [TokenCData _: _] = True
dataMode _ = False
charDataResult = lexCharData input offset
charDataResult = lexCharData '<' False input offset
processResult r = case r of
Token offset token = lex input offset [token:tokens] //Lex another token and do recursive call
NoToken offset = lex input offset tokens
Token offset token = lex input (inc offset) [token:tokens] //Lex another token and do recursive call
NoToken offset = lex input (inc offset) tokens
Fail err = Error err
//Try any of the lexers in the list until one succeeds
......@@ -229,7 +225,7 @@ where
lexEmptyTagClose = lexFixed "/>" TokenEmptyTagClose
lexTagClose = lexFixed ">" TokenTagClose
lexDeclarationStart input offset = case lexFixed "<?xml" TokenDeclarationStart input offset of
lexDeclarationStart input offset = case lexFixed "<?xml" TokenDeclarationStart input offset of
Nothing = Nothing
Just res
| offset == 0 = Just res
......@@ -242,7 +238,7 @@ where
Just $
maybe
(Fail "CDATA start without end")
(\endIdx -> Token (endIdx + 4) $ TokenCData $ input % (offset + 9, endIdx))
(\endIdx -> Token (endIdx + 3) $ TokenCData $ input % (offset + 9, endIdx))
(dataEndIndex $ inc offset)
where
dataEndIndex :: !Int -> Maybe Int
......@@ -252,8 +248,8 @@ where
| otherwise = dataEndIndex $ inc curIndex
//Char data
lexCharData :: !String !Int -> Maybe LexFunctionResult
lexCharData input offset =
lexCharData :: !Char !Bool !String !Int -> Maybe LexFunctionResult
lexCharData endChar endCharBelongsToToken input offset =
case lexCharData` offset [] of
Error e = Just $ Fail e
Ok ([], _) = Nothing
......@@ -263,18 +259,18 @@ where
lexCharData` offset accum
| input.[offset] == '&'
# end = findEnd (\c -> c >= 'a' && c <= 'z') input (offset + 1)
| input.[end] <> ';' = Error "Missing ';' at end of character entity"
| input.[end] <> ';' = Error $ "Missing ';' at end of character entity"
# name = input % (offset + 1, end - 1)
= maybe
(Error $ concat ["Unknown named character entity reference '", name, "'"])
(\charString -> lexCharData` (end + 1) [charString: accum])
(entityCharacter name)
| input.[offset] <> '<'
| input.[offset] <> endChar
# end = findEnd isTextChar input (offset + 1)
= lexCharData` end [input % (offset, end - 1): accum]
| otherwise = Ok (reverse accum, offset)
| otherwise = Ok (reverse accum, if endCharBelongsToToken offset (offset - 1))
where
isTextChar c = c <> '<' && c <> '&'
isTextChar c = c <> endChar && c <> '&'
entityCharacter :: !String -> Maybe String
entityCharacter "quot" = Just "\""
......@@ -286,7 +282,7 @@ where
//Names
lexName input offset
| isNameStartChar input.[offset] = Just (Token end (TokenName (input % (offset, end - 1))))
| isNameStartChar input.[offset] = Just (Token (dec end) (TokenName (input % (offset, end - 1))))
| otherwise = Nothing
where
end = findEnd isNameChar input (offset + 1)
......@@ -305,21 +301,14 @@ where
//AttrValue
lexAttrValue input offset
| input.[offset] <> '"' = Nothing
= Just (Token end (TokenAttrValue (input % (offset + 1, end - 2))))
where
end = findAttrValueEnd input (offset + 1)
findAttrValueEnd input offset
| offset >= size input = offset
| input.[offset] == '"' = offset + 1
| otherwise = findAttrValueEnd input (offset + 1)
| input.[offset] <> '"' = Nothing
| otherwise = lexCharData '"' True input (inc offset)
lexWhitespace input offset
| last == offset = Nothing
= Just (NoToken last)
| next == offset = Nothing
| otherwise = Just (NoToken $ dec next)
where
last = findEnd isWhitespace input offset
next = findEnd isWhitespace input offset
isWhitespace '\x20' = True
isWhitespace '\x9' = True
......@@ -329,14 +318,14 @@ where
//Lex token of fixed size
lexFixed chars token input offset
| input % (offset,offset + (size chars) - 1) == chars = Just (Token (offset + size chars) token)
= Nothing
| input % (offset,offset + (size chars) - 1) == chars = Just (Token (offset + size chars - 1) token)
| otherwise = Nothing
//Find the first offset where the predicate no longer holds
//Find the first offset where the predicate no longer holds
findEnd pred input offset
| offset >= size input = offset
| pred input.[offset] = findEnd pred input (offset + 1)
= offset
| offset >= size input = offset
| pred input.[offset] = findEnd pred input (offset + 1)
| otherwise = offset
pXMLDoc :: Parser Token XMLDoc
pXMLDoc = begin1 pXMLDoc`
......@@ -365,7 +354,7 @@ pElemStart = (\name attributes -> (name,attributes)) @> symbol TokenStartTagOpe
pElemContEnd = symbol TokenEndTagOpen &> pName <& symbol TokenTagClose
pAttr = (\name v -> XMLAttr (toQName name) v) @> pName +&- symbol TokenEqual +&+ pAttrValue
pName = satisfy isName <@ \n -> case n of TokenName n -> n; _ -> abort "error in pName\n"
pAttrValue = satisfy isAttrValue <@ \n -> case n of TokenAttrValue v -> v; _ -> abort "error in pAttrValue\n"
pAttrValue = satisfy isCharData <@ \n -> case n of TokenCharData v -> v; _ -> abort "error in pAttrValue\n"
pCharData = satisfy isCharData <@ \n -> case n of TokenCharData d -> d; _ -> abort "error in pCharData\n"
pCData = satisfy (\t -> t =: TokenCData _) <@ \t -> case t of TokenCData data = XMLCData data; _ = undef
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment