Verified Commit f36157fd authored by Steffen Michels's avatar Steffen Michels Committed by Camil Staps

add support for numeric character references in XML strings

parent 0f49b68a
Pipeline #41508 passed with stage
in 1 minute and 47 seconds
......@@ -5,28 +5,11 @@ definition module Text.GenXML
* a generic conversion of Clean values to XML.
*
* @property-bootstrap
* import StdEnv, Data.Error, Data.Func, Text.GenXML.Gast, Text.GenXML.GenPrint
* import StdEnv, Data.Error, Data.Func, Text.GenXML.Gast, Text.GenXML.GenPrint, Data.Maybe.Gast
*
* derive genShow MaybeError
* derive gPrint MaybeError
* derive class Gast Maybe
*
* withMergedConsecutiveText :: !XMLDoc -> XMLDoc
* withMergedConsecutiveText (XMLDoc defaultNamespace namespaces rootNode) =
* XMLDoc defaultNamespace namespaces $ nodeWithMergedConsecutiveText rootNode
*
* nodeWithMergedConsecutiveText :: !XMLNode -> XMLNode
* nodeWithMergedConsecutiveText (XMLElem name attrs nodes) =
* XMLElem name attrs $ nodesWithMergedConsecutiveText nodes
* nodeWithMergedConsecutiveText node = node
*
* nodesWithMergedConsecutiveText :: ![XMLNode] -> [XMLNode]
* nodesWithMergedConsecutiveText [] = []
* nodesWithMergedConsecutiveText [XMLText text1, XMLText text2: rest] =
* nodesWithMergedConsecutiveText [XMLText $ text1 +++ text2: rest]
* nodesWithMergedConsecutiveText [node: rest] =
* [nodeWithMergedConsecutiveText node: nodesWithMergedConsecutiveText rest]
*/
* derive genShow MaybeError
* derive gPrint MaybeError
*/
import StdOverloaded, StdGeneric, Data.Either
from Data.Maybe import :: Maybe
......@@ -81,8 +64,8 @@ instance toString XMLDoc
/**
* The XML document corresponding to a string.
*
* @property correctness: A.doc :: XMLDoc:
* fromString (toString doc) =.= typedOk (withMergedConsecutiveText doc)
* @property correctness: A.doc :: XMLDocWithString:
* fromString doc.stringRepresentation =.= typedOk doc.document
* where
* typedOk :: !XMLDoc -> MaybeErrorString XMLDoc
* typedOk doc = Ok doc
......
implementation module Text.GenXML
import StdEnv
import Data.Error, Data.Either, Data.Maybe, Text, Data.GenEq, Data.Func, Data._Array
import Data.Error, Data.Either, Data.Maybe, Text, Data.GenEq, Data.Func, Data._Array, Data.Functor
from Control.Applicative import class <*> (..)
import Text.GenParse
from Text.Parsers.CParsers.ParserCombinators import :: Parser, :: ParsResult, :: CParser, &>, +&+, +&-, -&+, <!>, <&, <&>, <*?>, <@, >?<, @>, begin1, satisfy, symbol, yield, <|>, <+?>, fail
instance == XMLDoc where
......@@ -277,11 +279,11 @@ where
lexCharData` :: !Int ![String] -> MaybeErrorString (![String], !Int)
lexCharData` offset accum
| input.[offset] == '&'
# end = findEnd (\c -> c >= 'a' && c <= 'z') input (offset + 1)
# end = findEnd (\c -> c <> ';') input (offset + 1)
| input.[end] <> ';' = Error $ "Missing ';' at end of character entity"
# name = input % (offset + 1, end - 1)
= maybe
(Error $ concat ["Unknown named character entity reference '", name, "'"])
(Error $ concat ["Unknown character entity reference '", name, "'"])
(\charString -> lexCharData` (end + 1) [charString: accum])
(entityCharacter name)
| input.[offset] <> endChar
......@@ -297,7 +299,30 @@ where
entityCharacter "apos" = Just "'"
entityCharacter "lt" = Just "<"
entityCharacter "gt" = Just ">"
entityCharacter _ = Nothing
entityCharacter ref
| ref.[0] == '#' && ref.[1] == 'x'
| refSize == 4 = (\d1 d2 -> {shifted d1 + d2}) <$> d1 <*> d2
| refSize == 6 = (\d1 d2 d3 d4 -> {shifted d1 + d2, shifted d3 + d4}) <$> d1 <*> d2 <*> d3 <*> d4
| otherwise = Nothing
where
d1 = valueOf ref.[2]
d2 = valueOf ref.[3]
d3 = valueOf ref.[4]
d4 = valueOf ref.[5]
refSize = size ref
shifted :: !Char -> Char
shifted c = toChar (toInt c << 4)
valueOf :: !Char -> Maybe Char
valueOf c
| not $ isHexDigit c = Nothing
| isDigit c = Just $ c - '0'
| isLower c = Just $ c - 'a' + '\d10'
| isUpper c = Just $ c - 'A' + '\d10'
| otherwise = Nothing
entityCharacter ref | ref.[0] == '#' = (\i -> {fromInt i}) <$> parseString (ref % (1, size ref - 1))
entityCharacter _ = Nothing
//Names
lexName input offset
......
definition module Text.GenXML.Gast
from Text.GenPrint import generic gPrint, class PrintOutput, :: PrintState
from Gast import generic ggen, generic genShow, :: GenState
from Text.GenXML import :: XMLDoc, :: XMLNode, :: XMLAttr, :: XMLQName
derive ggen XMLDoc, XMLQName, XMLNode, XMLAttr
derive genShow XMLDoc, XMLQName, XMLNode, XMLAttr
//* An XML doc with the corresponding string representation.
:: XMLDocWithString = {document :: !XMLDoc, stringRepresentation :: !String}
derive ggen XMLDocWithString, XMLDoc, XMLQName, XMLNode, XMLAttr
derive genShow XMLDocWithString, XMLDoc, XMLQName, XMLNode, XMLAttr
derive gPrint XMLDocWithString
......@@ -3,25 +3,47 @@ implementation module Text.GenXML.Gast
import StdEnv
import Gast
import Data.Func, Data.Functor, Data.Maybe, Data.Maybe.Gast, Data.List, Data.Bifunctor, Data.Tuple
import Text.GenXML
import Text.GenXML, Text.GenXML.GenPrint
import Control.GenBimap
derive genShow XMLDoc, XMLQName, XMLNode, XMLAttr
derive ggen XMLNode, XMLAttr
ggen{|XMLDocWithString|} st =
[ docWithHexCharacterReferences
, docWithDecCharacterReferences
: (\doc -> {document = doc, stringRepresentation = toString doc}) <$> docs
]
where
// This cannot be generated by `toString`, as UTF-8 characters are included in the string representation unencoded.
docWithHexCharacterReferences =
{ document = testDocWith [XMLAttr (uname "a") "ó ¾"] [XMLText "Test"]
, stringRepresentation =
"<?xml version=\"1.0}\" standalone=\"no\"?><root a=\"&#xc3;&#xb3; &#xc2bE;\">&#x54;&#x65;&#x73;&#x74;</root>"
}
docWithDecCharacterReferences =
{ document = testDocWith [XMLAttr (uname "a") "ó"] [XMLText "Test"]
, stringRepresentation =
"<?xml version=\"1.0}\" standalone=\"no\"?><root a=\"&#195;&#179;\">&#84;&#101;&#115;&#116;</root>"
}
docs :: [XMLDoc]
docs = ggen{|*|} st
// TODO: Generate URIs for namespaces, instead of using names.
ggen{|XMLDoc|} st =
[ docWithNamedCharacterEntityReference
: [ XMLDoc
(unNameString <$> defaultNamespace)
(bifmap unNameString unNameString <$> namespaces)
(XMLElem rootName rootAttrs rootChildren)
\\ (defaultNamespace, namespaces, rootName, rootAttrs, rootChildren) <- ggen{|*|} st
withMergedConsecutiveText <$>
[ docWithNamedCharacterEntityReference
: [ XMLDoc
(unNameString <$> defaultNamespace)
(bifmap unNameString unNameString <$> namespaces)
(XMLElem rootName rootAttrs rootChildren)
\\ (defaultNamespace, namespaces, rootName, rootAttrs, rootChildren) <- ggen{|*|} st
]
]
]
where
docWithNamedCharacterEntityReference =
XMLDoc Nothing [] (XMLElem (uname "someName") [XMLAttr (uname "attr") "< >\" '&"] [XMLText "\"& <>\""])
docWithNamedCharacterEntityReference = testDocWith [XMLAttr (uname "attr") "< >\" '&"] [XMLText "\"& <>\""]
testDocWith :: ![XMLAttr] ![XMLNode] -> XMLDoc
testDocWith attrs content = XMLDoc Nothing [] (XMLElem (uname "root") attrs content)
ggen{|XMLQName|} st = [XMLQName (unNameString <$> namespace) (unNameString name) \\ (namespace, name) <- ggen{|*|} st]
......@@ -32,3 +54,23 @@ unNameString (NameString str) = str
// TODO: Also include capital letters and other valid characters.
ggen{|NameString|} _ = [NameString str \\ str <- ggenString 7 4.0 97 122 aStream | str <> ""]
withMergedConsecutiveText :: !XMLDoc -> XMLDoc
withMergedConsecutiveText (XMLDoc defaultNamespace namespaces rootNode) =
XMLDoc defaultNamespace namespaces $ nodeWithMergedConsecutiveText rootNode
where
nodeWithMergedConsecutiveText :: !XMLNode -> XMLNode
nodeWithMergedConsecutiveText (XMLElem name attrs nodes) =
XMLElem name attrs $ nodesWithMergedConsecutiveText nodes
nodeWithMergedConsecutiveText node = node
nodesWithMergedConsecutiveText :: ![XMLNode] -> [XMLNode]
nodesWithMergedConsecutiveText [] = []
nodesWithMergedConsecutiveText [XMLText text1, XMLText text2: rest] =
nodesWithMergedConsecutiveText [XMLText $ text1 +++ text2: rest]
nodesWithMergedConsecutiveText [node: rest] =
[nodeWithMergedConsecutiveText node: nodesWithMergedConsecutiveText rest]
derive ggen XMLNode, XMLAttr
derive genShow XMLDocWithString, XMLDoc, XMLQName, XMLNode, XMLAttr
derive gPrint XMLDocWithString
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment