Commit 631806b9 authored by Camil Staps

Merge branch 'fixXmlBug' into 'master'

fix bugs in XML decoder

See merge request !322
parents c63ccb61 3d409129
Pipeline #41384 passed in 1 minute and 48 seconds
@@ -23,9 +23,9 @@ definition module Text.GenXML
* nodesWithMergedConsecutiveText :: ![XMLNode] -> [XMLNode]
* nodesWithMergedConsecutiveText [] = []
* nodesWithMergedConsecutiveText [XMLText text1, XMLText text2: rest] =
* nodesWithMergedConsecutiveText [XMLText $ text1 +++ text2: rest]
* nodesWithMergedConsecutiveText [node: rest] =
* [nodeWithMergedConsecutiveText node: nodesWithMergedConsecutiveText rest]
*/
import StdOverloaded, StdGeneric, Data.Either
@@ -58,9 +58,11 @@ where
where
escapedSize` s n i
| i == n = 0
| s.[i] == '<' = 4 + escapedSize` s n (i + 1)
| s.[i] == '>' = 4 + escapedSize` s n (i + 1)
| s.[i] == '&' = 5 + escapedSize` s n (i + 1)
+| s.[i] == '\'' = 6 + escapedSize` s n (i + 1)
+| s.[i] == '"' = 6 + escapedSize` s n (i + 1)
| otherwise = 1 + escapedSize` s n (i + 1)
serializeDoc :: !XMLDoc !*{#Char} !Int -> (!*{#Char}, !Int)
@@ -138,8 +140,25 @@ where
# dest = {dest & [dest_i] = '&', [dest_i + 1] = 'g', [dest_i + 2] = 't', [dest_i + 3] = ';'}
= copyChars src (src_i + 1) escape dest (dest_i + 4)
| escape && (src.[src_i] == '&')
-# dest = {dest & [dest_i] = '&', [dest_i + 1] = 'a', [dest_i + 2] = 'm', [dest_i + 3] = 'p', [dest_i + 4] = ';'}
+# dest =
+	{ dest
+	& [dest_i] = '&', [dest_i + 1] = 'a', [dest_i + 2] = 'm', [dest_i + 3] = 'p', [dest_i + 4] = ';'
+	}
= copyChars src (src_i + 1) escape dest (dest_i + 5)
+| escape && (src.[src_i] == '"')
+# dest =
+	{ dest
+	& [dest_i] = '&', [dest_i + 1] = 'q', [dest_i + 2] = 'u', [dest_i + 3] = 'o'
+	, [dest_i + 4] = 't', [dest_i + 5] = ';'
+	}
+= copyChars src (src_i + 1) escape dest (dest_i + 6)
+| escape && (src.[src_i] == '\'')
+# dest =
+	{ dest
+	& [dest_i] = '&', [dest_i + 1] = 'a', [dest_i + 2] = 'p', [dest_i + 3] = 'o'
+	, [dest_i + 4] = 's', [dest_i + 5] = ';'
+	}
+= copyChars src (src_i + 1) escape dest (dest_i + 6)
| otherwise
# dest = {dest & [dest_i] = src.[src_i]}
= copyChars src (src_i + 1) escape dest (dest_i + 1)
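
For illustration only (not part of the change set): a minimal sketch of what the extended escaping is expected to produce, reusing the test document added further down and assuming Text.GenXML exports a toString instance for XMLDoc; the rendering given in the comment is an expectation, not verified output.

module escapingSketch

import StdEnv
import Text.GenXML

// Serialize an attribute value and a text node that contain the five
// characters which are now escaped (&lt; &gt; &amp; &quot; &apos;).
// Assumes `instance toString XMLDoc` is available from Text.GenXML;
// the document value is taken from the test module below.
Start :: String
Start = toString doc
where
	doc = XMLDoc Nothing []
		(XMLElem (uname "someName") [XMLAttr (uname "attr") "< >\" '&"] [XMLText "\"& <>\""])
// Expected attribute rendering (assumption): attr="&lt; &gt;&quot; &apos;&amp;"

The per-character replacement lengths should match the sizes used in escapedSize above: 4 for &lt; and &gt;, 5 for &amp;, and 6 for &quot; and &apos;.
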
@@ -154,8 +173,7 @@ where
= Ok (snd (hd xmlDoc))
//Token type which is the intermediary representation during XML parsing
-:: Token = TokenAttrValue !String
-| TokenCharData !String
+:: Token = TokenCharData !String
| TokenName !String
| TokenStartTagOpen
| TokenTagClose
@@ -176,18 +194,15 @@ isName _ = False
isCharData (TokenCharData _) = True
isCharData _ = False
-isAttrValue (TokenAttrValue _) = True
-isAttrValue _ = False
:: LexFunctionResult = Token !Int !Token | NoToken !Int | Fail !String
:: LexFunction :== String Int -> Maybe LexFunctionResult
lex :: !String !Int ![Token] -> MaybeErrorString [Token]
lex input offset tokens
| offset >= size input = Ok (reverse tokens) //Done
| dataMode tokens && isJust charDataResult = processResult (fromJust charDataResult)
| otherwise = processResult (lexAny input offset lexFunctions)
where
lexFunctions = [ lexWhitespace
, lexDeclarationStart
@@ -208,11 +223,11 @@ where
dataMode [TokenCData _: _] = True
dataMode _ = False
-charDataResult = lexCharData input offset
+charDataResult = lexCharData '<' False input offset
processResult r = case r of
-Token offset token = lex input offset [token:tokens] //Lex another token and do recursive call
-NoToken offset = lex input offset tokens
+Token offset token = lex input (inc offset) [token:tokens] //Lex another token and do recursive call
+NoToken offset = lex input (inc offset) tokens
Fail err = Error err
//Try any of the lexers in the list until one succeeds
@@ -229,7 +244,7 @@ where
lexEmptyTagClose = lexFixed "/>" TokenEmptyTagClose
lexTagClose = lexFixed ">" TokenTagClose
lexDeclarationStart input offset = case lexFixed "<?xml" TokenDeclarationStart input offset of
Nothing = Nothing
Just res
| offset == 0 = Just res
@@ -242,7 +257,7 @@
Just $
maybe
(Fail "CDATA start without end")
-(\endIdx -> Token (endIdx + 4) $ TokenCData $ input % (offset + 9, endIdx))
+(\endIdx -> Token (endIdx + 3) $ TokenCData $ input % (offset + 9, endIdx))
(dataEndIndex $ inc offset)
where
dataEndIndex :: !Int -> Maybe Int
@@ -252,31 +267,29 @@ where
| otherwise = dataEndIndex $ inc curIndex
//Char data
-lexCharData :: !String !Int -> Maybe LexFunctionResult
-lexCharData input offset =
+lexCharData :: !Char !Bool !String !Int -> Maybe LexFunctionResult
+lexCharData endChar endCharBelongsToToken input offset =
case lexCharData` offset [] of
Error e = Just $ Fail e
Ok ([], _) = Nothing
-Ok (dataStrings, end) = Just $ Token end (TokenCharData $ concat dataStrings)
+Ok (dataStrings, end) = Just $ Token end (TokenCharData $ trim $ concat dataStrings)
where
lexCharData` :: !Int ![String] -> MaybeErrorString (![String], !Int)
lexCharData` offset accum
| input.[offset] == '&'
# end = findEnd (\c -> c >= 'a' && c <= 'z') input (offset + 1)
-| input.[end] <> ';' = Error "Missing ';' at end of character entity"
+| input.[end] <> ';' = Error $ "Missing ';' at end of character entity"
# name = input % (offset + 1, end - 1)
= maybe
(Error $ concat ["Unknown named character entity reference '", name, "'"])
(\charString -> lexCharData` (end + 1) [charString: accum])
(entityCharacter name)
-| isTextChar input.[offset]
+| input.[offset] <> endChar
# end = findEnd isTextChar input (offset + 1)
-# data = trim (input % (offset, end - 1))
-| data <> "" = lexCharData` end [data: accum]
-| otherwise = Ok (accum, offset)
-| otherwise = Ok (reverse accum, offset)
+= lexCharData` end [input % (offset, end - 1): accum]
+| otherwise = Ok (reverse accum, if endCharBelongsToToken offset (offset - 1))
where
-isTextChar c = c <> '<' && c <> '&'
+isTextChar c = c <> endChar && c <> '&'
entityCharacter :: !String -> Maybe String
entityCharacter "quot" = Just "\""
@@ -288,7 +301,7 @@ where
//Names
lexName input offset
-| isNameStartChar input.[offset] = Just (Token end (TokenName (input % (offset, end - 1))))
+| isNameStartChar input.[offset] = Just (Token (dec end) (TokenName (input % (offset, end - 1))))
| otherwise = Nothing
where
end = findEnd isNameChar input (offset + 1)
@@ -307,21 +320,14 @@ where
//AttrValue
lexAttrValue input offset
-| input.[offset] <> '"' = Nothing
-= Just (Token end (TokenAttrValue (input % (offset + 1, end - 2))))
-where
-end = findAttrValueEnd input (offset + 1)
-findAttrValueEnd input offset
-| offset >= size input = offset
-| input.[offset] == '"' = offset + 1
-| otherwise = findAttrValueEnd input (offset + 1)
+| input.[offset] <> '"' = Nothing
+| otherwise = lexCharData '"' True input (inc offset)
lexWhitespace input offset
-| last == offset = Nothing
-= Just (NoToken last)
+| next == offset = Nothing
+| otherwise = Just (NoToken $ dec next)
where
-last = findEnd isWhitespace input offset
+next = findEnd isWhitespace input offset
isWhitespace '\x20' = True
isWhitespace '\x9' = True
@@ -331,14 +337,14 @@ where
//Lex token of fixed size
lexFixed chars token input offset
-| input % (offset,offset + (size chars) - 1) == chars = Just (Token (offset + size chars) token)
-= Nothing
+| input % (offset,offset + (size chars) - 1) == chars = Just (Token (offset + size chars - 1) token)
+| otherwise = Nothing
//Find the first offset where the predicate no longer holds
findEnd pred input offset
-| offset >= size input = offset
-| pred input.[offset] = findEnd pred input (offset + 1)
-= offset
+| offset >= size input = offset
+| pred input.[offset] = findEnd pred input (offset + 1)
+| otherwise = offset
pXMLDoc :: Parser Token XMLDoc
pXMLDoc = begin1 pXMLDoc`
@@ -367,7 +373,7 @@ pElemStart = (\name attributes -> (name,attributes)) @> symbol TokenStartTagOpe
pElemContEnd = symbol TokenEndTagOpen &> pName <& symbol TokenTagClose
pAttr = (\name v -> XMLAttr (toQName name) v) @> pName +&- symbol TokenEqual +&+ pAttrValue
pName = satisfy isName <@ \n -> case n of TokenName n -> n; _ -> abort "error in pName\n"
-pAttrValue = satisfy isAttrValue <@ \n -> case n of TokenAttrValue v -> v; _ -> abort "error in pAttrValue\n"
+pAttrValue = satisfy isCharData <@ \n -> case n of TokenCharData v -> v; _ -> abort "error in pAttrValue\n"
pCharData = satisfy isCharData <@ \n -> case n of TokenCharData d -> d; _ -> abort "error in pCharData\n"
pCData = satisfy (\t -> t =: TokenCData _) <@ \t -> case t of TokenCData data = XMLCData data; _ = undef
@@ -11,12 +11,17 @@ derive ggen XMLNode, XMLAttr
// TODO: Generate URIs for namespaces, instead of using names.
ggen{|XMLDoc|} st =
-[ XMLDoc
-(unNameString <$> defaultNamespace)
-(bifmap unNameString unNameString <$> namespaces)
-(XMLElem rootName rootAttrs rootChildren)
-\\ (defaultNamespace, namespaces, rootName, rootAttrs, rootChildren) <- ggen{|*|} st
+[ docWithNamedCharacterEntityReference
+: [ XMLDoc
+(unNameString <$> defaultNamespace)
+(bifmap unNameString unNameString <$> namespaces)
+(XMLElem rootName rootAttrs rootChildren)
+\\ (defaultNamespace, namespaces, rootName, rootAttrs, rootChildren) <- ggen{|*|} st
+]
]
+where
+docWithNamedCharacterEntityReference =
+XMLDoc Nothing [] (XMLElem (uname "someName") [XMLAttr (uname "attr") "< >\" '&"] [XMLText "\"& <>\""])
ggen{|XMLQName|} st = [XMLQName (unNameString <$> namespace) (unNameString name) \\ (namespace, name) <- ggen{|*|} st]
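
A decoding sketch in the same spirit (illustrative, with assumptions): it presumes the module still exposes parsing through a fromString instance yielding MaybeErrorString XMLDoc, which is not visible in this diff, and shows the kind of input whose named character entities the fixed lexer should now decode in both attribute values and character data.

module decodingSketch

import StdEnv
import Data.Error
import Text.GenXML

// Hypothetical input with named character entities in an attribute value
// and in character data; with the fixed lexer these should decode to
// < > " ' & inside the resulting XMLDoc.
input :: String
input = "<someName attr=\"&lt;x&gt; &quot;y&apos;\">a &amp; b</someName>"

Start :: MaybeErrorString XMLDoc
Start = fromString input // assumed entry point; adjust to the real parsing API
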