[XSL-LIST Mailing List Archive Home]
[By Thread]
[By Date]
On 2011-05-20 17:34, Julian Reschke wrote:
Thanks for all the feedback. In the end I went for a pure XSLT2 implementation, supporting ISO-8859-1 and UTF-8. See below.
I'm doing a lot of XSLT 1.0 but not so much XSLT 2.0, so comments on how to make this more elegant are welcome.
XSLT (to be applied to some random XML):
<xsl:output method="xml" indent="yes"/>
<xsl:variable name="attr-char">!#\$&\+\-\.\^_`\|~<xsl:value-of select="$DIGIT"/><xsl:value-of select="$ALPHA"/></xsl:variable>
<xsl:variable name="DIGIT">0-9</xsl:variable>
<xsl:variable name="ALPHA">a-zA-Z</xsl:variable>
<xsl:variable name="HEXDIG">a-fA-F<xsl:value-of select="$DIGIT"/></xsl:variable>
<xsl:variable name="pct-encoded">%[<xsl:value-of select="$HEXDIG"/>][<xsl:value-of select="$HEXDIG"/>]</xsl:variable>
<xsl:variable name="reg">(<xsl:value-of select="$pct-encoded"/>)|[<xsl:value-of select="$attr-char"/>]</xsl:variable>
<xsl:variable name="digits" select="('0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F')"/>
<xsl:analyze-string select="$s" regex="{$reg}" flags="mx">
<xsl:matching-substring>
<xsl:choose>
<xsl:when test="starts-with(.,'%')">
<xsl:variable name="a" select="index-of($digits,upper-case(substring(.,2,1)))-1"/>
<xsl:variable name="b" select="index-of($digits,upper-case(substring(.,3,1)))-1"/>
<xsl:variable name="cp" select="$a * 16 + $b"/>
<xsl:choose>
<xsl:when test="$cp >= 128">
<octet><xsl:value-of select="$cp"/></octet>
</xsl:when>
<xsl:otherwise>
<c><xsl:value-of select="codepoints-to-string($cp)"/></c>
</xsl:otherwise>
</xsl:choose>
</xsl:when>
<xsl:otherwise>
<!-- single character -->
<c><xsl:value-of select="."/></c>
</xsl:otherwise>
</xsl:choose>
</xsl:matching-substring>
</xsl:analyze-string>
</xsl:function>
<xsl:choose>
<xsl:when test="$result/illegal-octet">
<illegal-octet><xsl:value-of select="$result/illegal-octet"/></illegal-octet>
</xsl:when>
<xsl:otherwise>
<string><xsl:value-of select="$result"/></string>
</xsl:otherwise>
</xsl:choose>
</xsl:function>
<xsl:variable name="octets">
<xsl:for-each select="$s">
<o>
<xsl:choose>
<xsl:when test="self::octet"><xsl:value-of select="."/></xsl:when>
<xsl:otherwise><xsl:value-of select="string-to-codepoints(.)"/></xsl:otherwise>
</xsl:choose>
</o>
</xsl:for-each>
</xsl:variable>
<xsl:choose>
<xsl:when test="$result/illegal-octet">
<illegal-octet><xsl:value-of select="$result/illegal-octet"/></illegal-octet>
</xsl:when>
<xsl:otherwise>
<string><xsl:value-of select="$result"/></string>
</xsl:otherwise>
</xsl:choose>
</xsl:function>
<xsl:choose>
<xsl:when test="not($octets)"><!--done--></xsl:when>
<xsl:when test="count($octets) >= 4 and $octets[1] >= 240 and $octets[2] >= 128 and $octets[3] >= 128 and $octets[4] >= 128">
<xsl:value-of select="codepoints-to-string(xs:integer(((((($octets[1] mod 32) * 64) + ($octets[2] mod 32)) * 64) + ($octets[3] mod 64) * 64) + ($octets[4] mod 64)))"/>
<xsl:call-template name="internal-utf8">
<xsl:with-param name="octets" select="$octets[position() > 4]"/>
</xsl:call-template>
</xsl:when>
<xsl:when test="count($octets) >= 3 and $octets[1] >= 224 and $octets[2] >= 128 and $octets[3] >= 128">
<xsl:value-of select="codepoints-to-string(xs:integer((((($octets[1] mod 32) * 64) + ($octets[2] mod 32)) * 64) + ($octets[3] mod 64)))"/>
<xsl:call-template name="internal-utf8">
<xsl:with-param name="octets" select="$octets[position() > 3]"/>
</xsl:call-template>
</xsl:when>
<xsl:when test="count($octets) >= 2 and $octets[1] >= 192 and $octets[2] >= 128">
<xsl:value-of select="codepoints-to-string(xs:integer((($octets[1] mod 32) * 64) + ($octets[2] mod 64)))"/>
<xsl:call-template name="internal-utf8">
<xsl:with-param name="octets" select="$octets[position() > 2]"/>
</xsl:call-template>
</xsl:when>
<xsl:when test="$octets[1] < 128">
<xsl:value-of select="codepoints-to-string($octets[1])"/>
<xsl:call-template name="internal-utf8">
<xsl:with-param name="octets" select="$octets[position() > 1]"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<illegal-octet><xsl:value-of select="$octets[1]"/></illegal-octet>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
</xsl:transform>
Output:
Best regards, Julian
Re: [xsl] decoding percent-escaped octet sequences
Subject: Re: [xsl] decoding percent-escaped octet sequences From: Julian Reschke <julian.reschke@xxxxxx> Date: Thu, 26 May 2011 10:56:02 +0200 |
On 2011-05-20 17:34, Julian Reschke wrote:
Hi,
Do XSLT/XPath2 offer an elegant way to convert percent-escaped octet sequences to strings (both for ISO-8859-1 and UTF-8)?
So far I found codepoints-to-string, but that would mean that I'd still have to do
1) percent-escaped-string to sequence of octets, and 2) sequence-of UTF-8 octets to sequence of codepoints.
Did I miss something here?
Best regards, Julian ...
Thanks for all the feedback. In the end I went for a pure XSLT2 implementation, supporting ISO-8859-1 and UTF-8. See below.
I'm doing a lot of XSLT 1.0 but not so much XSLT 2.0, so comments on how to make this more elegant are welcome.
XSLT (to be applied to some random XML):
<?xml version="1.0" encoding="ISO-8859-1"?> <xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:myns="mailto:julian.reschke@xxxxxxxxxxxxx?subject=pctdecode" exclude-result-prefixes="myns" >
<xsl:output method="xml" indent="yes"/>
<!-- Entry point: ignores the source document and emits one <result> per
     hard-coded (encoding, percent-escaped value) test pair, wrapped in <results>. -->
<xsl:template match="/">
  <results>
    <xsl:sequence select="
        myns:test('utf-8',      'A%20C'),
        myns:test('iso-8859-1', 'A%20C'),
        myns:test('utf-8',      'A%C3%A4'),
        myns:test('iso-8859-1', 'A%E4'),
        myns:test('utf-8',      'A%E4')"/>
  </results>
</xsl:template>
<!-- Test harness: echoes the inputs under <input> and puts the decoding of
     $value under <parsed>.
       $enc   : encoding label, compared case-insensitively against
                'iso-8859-1' and 'utf-8'
       $value : percent-escaped string handed to myns:pct-decode
     For any other encoding label <parsed> is left empty. -->
<xsl:function name="myns:test">
  <xsl:param name="enc"/>
  <xsl:param name="value"/>
  <result>
    <input>
      <enc><xsl:value-of select="$enc"/></enc>
      <value><xsl:value-of select="$value"/></value>
    </input>
    <parsed>
      <xsl:variable name="charset" select="lower-case($enc)"/>
      <xsl:variable name="tokens" select="myns:pct-decode($value)"/>
      <!-- the final branch is the "unsupported encoding" case: nothing is emitted -->
      <xsl:sequence select="
          if ($charset = 'iso-8859-1') then myns:decode-iso-8859-1($tokens)
          else if ($charset = 'utf-8') then myns:decode-utf-8($tokens)
          else ()"/>
    </parsed>
  </result>
</xsl:function>
<!-- Regular-expression fragments used to tokenise a percent-escaped string.
     Each variable holds text meant to be spliced into a larger regex:
       attr-char   : body of a character class (punctuation, digits, letters);
                     the names look like the RFC 5987 grammar, TODO confirm
       DIGIT/ALPHA : character-class ranges
       HEXDIG      : character-class body for one hexadecimal digit
       pct-encoded : '%' followed by exactly two hexadecimal digits
     Global variables may be referenced before their declaration, so the
     forward references to $DIGIT and $ALPHA are fine. -->
<!-- The ampersand must be written as an entity reference; a bare '&' makes
     the stylesheet not well-formed. The resulting string still contains a
     literal '&'. -->
<xsl:variable name="attr-char">!#\$&amp;\+\-\.\^_`\|~<xsl:value-of select="$DIGIT"/><xsl:value-of select="$ALPHA"/></xsl:variable>
<xsl:variable name="DIGIT">0-9</xsl:variable>
<xsl:variable name="ALPHA">a-zA-Z</xsl:variable>
<xsl:variable name="HEXDIG">a-fA-F<xsl:value-of select="$DIGIT"/></xsl:variable>
<xsl:variable name="pct-encoded">%[<xsl:value-of select="$HEXDIG"/>][<xsl:value-of select="$HEXDIG"/>]</xsl:variable>
<!-- Tokenises a percent-escaped string.
       $s : the string to scan
     Returns a sequence of elements, one per matched token:
       <octet>N</octet> for a %XX escape whose value N is 128 or greater
       <c>ch</c>        for a %XX escape below 128 (already turned into its
                        character) and for every literal character allowed
                        by $attr-char
     NOTE(review): there is no xsl:non-matching-substring, so any part of $s
     that matches neither alternative (for example a space or a '%' that is
     not followed by two hex digits) is dropped without a diagnostic. -->
<xsl:function name="myns:pct-decode">
  <xsl:param name="s"/>
  <!-- either one %XX escape (group 1) or one literal attr-char -->
  <xsl:variable name="token-regex" select="concat('(', $pct-encoded, ')|[', $attr-char, ']')"/>
  <!-- a hex digit's value equals the number of characters that precede it here -->
  <xsl:variable name="hex-alphabet" select="'0123456789ABCDEF'"/>
  <xsl:analyze-string select="$s" regex="{$token-regex}" flags="mx">
    <xsl:matching-substring>
      <!-- numeric value of the escape, or the empty sequence for a literal character -->
      <xsl:variable name="escape-value" select="
          if (starts-with(., '%'))
          then string-length(substring-before($hex-alphabet, upper-case(substring(., 2, 1)))) * 16
             + string-length(substring-before($hex-alphabet, upper-case(substring(., 3, 1))))
          else ()"/>
      <xsl:choose>
        <xsl:when test="$escape-value >= 128">
          <octet><xsl:value-of select="$escape-value"/></octet>
        </xsl:when>
        <xsl:when test="exists($escape-value)">
          <c><xsl:value-of select="codepoints-to-string($escape-value)"/></c>
        </xsl:when>
        <xsl:otherwise>
          <!-- single literal character -->
          <c><xsl:value-of select="."/></c>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:matching-substring>
  </xsl:analyze-string>
</xsl:function>
<!-- Turns a token sequence (<c> and <octet> elements) into a string, treating
     every <octet> value as an ISO-8859-1 code point.
       $s : sequence of <c>/<octet> elements
     Returns <string>text</string> on success. If any octet lies in the range
     128 to 159 the function returns <illegal-octet> instead, holding the
     offending value(s), space-separated when there is more than one. -->
<xsl:function name="myns:decode-iso-8859-1">
  <xsl:param name="s"/>
  <xsl:variable name="result">
    <xsl:for-each select="$s">
      <xsl:choose>
        <xsl:when test="self::octet">
          <xsl:choose>
            <!-- '<' must be escaped inside an attribute value; a bare '<'
                 makes the stylesheet not well-formed -->
            <xsl:when test=". > 127 and . &lt; 160">
              <illegal-octet><xsl:value-of select="."/></illegal-octet>
            </xsl:when>
            <xsl:otherwise>
              <!-- octet value used directly as the code point -->
              <xsl:value-of select="codepoints-to-string(.)"/>
            </xsl:otherwise>
          </xsl:choose>
        </xsl:when>
        <xsl:otherwise>
          <!-- any non-octet token already carries its character -->
          <xsl:value-of select="."/>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:for-each>
  </xsl:variable>
  <xsl:choose>
    <xsl:when test="$result/illegal-octet">
      <illegal-octet><xsl:value-of select="$result/illegal-octet"/></illegal-octet>
    </xsl:when>
    <xsl:otherwise>
      <string><xsl:value-of select="$result"/></string>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>
<!-- Turns a token sequence (<c> and <octet> elements) into a string by
     interpreting it as a UTF-8 octet stream.
       $s : sequence of <c>/<octet> elements
     Every token is first normalised to an <o> element holding a number:
     an <octet> keeps its value, anything else contributes the code point(s)
     of its text. The named template internal-utf8 then decodes the <o> list.
     Returns <string>text</string>, or <illegal-octet>N</illegal-octet> when
     the template reported an octet it could not decode. -->
<xsl:function name="myns:decode-utf-8">
  <xsl:param name="s"/>
  <xsl:variable name="octet-list">
    <xsl:for-each select="$s">
      <o><xsl:value-of select="if (self::octet) then . else string-to-codepoints(.)"/></o>
    </xsl:for-each>
  </xsl:variable>
  <xsl:variable name="decoded">
    <xsl:call-template name="internal-utf8">
      <xsl:with-param name="octets" select="$octet-list/o"/>
    </xsl:call-template>
  </xsl:variable>
  <xsl:variable name="rejected" select="$decoded/illegal-octet"/>
  <xsl:choose>
    <xsl:when test="$rejected">
      <illegal-octet><xsl:value-of select="$rejected"/></illegal-octet>
    </xsl:when>
    <xsl:otherwise>
      <string><xsl:value-of select="$decoded"/></string>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>
<!-- Recursive UTF-8 decoder.
       $octets : sequence of items whose values are octet numbers (0 to 255),
                 for example <o>195</o> elements
     Emits the decoded characters as text, one sequence per recursion step.
     On the first sequence that is not well-formed UTF-8 it emits
     <illegal-octet>N</illegal-octet> (N = the lead octet of that sequence)
     and stops; octets after it are not examined.

     Rejected as not well-formed: a continuation octet outside 128 to 191,
     a truncated sequence, lead octets 128 to 193 and 245 to 255, overlong
     3- and 4-octet forms, surrogates (U+D800 to U+DFFF) and values above
     U+10FFFF.

     Payload bits per octet: 5 in a 2-octet lead (mod 32), 4 in a 3-octet
     lead (mod 16), 3 in a 4-octet lead (mod 8), 6 in every continuation
     octet (mod 64).

     Value comparisons (lt, le, ge, ...) are used so that no '<' has to be
     escaped inside attribute values.

     NOTE: codepoints-to-string still raises a dynamic error for code points
     that are not legal XML characters (for example control characters). -->
<xsl:template name="internal-utf8">
  <xsl:param name="octets"/>
  <!-- integer view of at most the first four octets -->
  <xsl:variable name="b" select="for $o in $octets[position() le 4] return xs:integer($o)"/>
  <xsl:variable name="lead" select="$b[1]"/>
  <!-- sequence length announced by the lead octet; 0 means "not a valid lead" -->
  <xsl:variable name="len" select="
      if (empty($lead)) then 0
      else if ($lead lt 128) then 1
      else if ($lead ge 194 and $lead le 223) then 2
      else if ($lead ge 224 and $lead le 239) then 3
      else if ($lead ge 240 and $lead le 244) then 4
      else 0"/>
  <xsl:variable name="tail" select="subsequence($b, 2, $len - 1)"/>
  <!-- all announced continuation octets are present and in range -->
  <xsl:variable name="complete" select="
      $len ge 1
      and count($tail) eq $len - 1
      and (every $t in $tail satisfies ($t ge 128 and $t le 191))"/>
  <xsl:variable name="cp" select="
      if (not($complete)) then ()
      else if ($len eq 1) then $lead
      else if ($len eq 2) then ($lead mod 32) * 64 + ($b[2] mod 64)
      else if ($len eq 3) then (($lead mod 16) * 64 + ($b[2] mod 64)) * 64 + ($b[3] mod 64)
      else ((($lead mod 8) * 64 + ($b[2] mod 64)) * 64 + ($b[3] mod 64)) * 64 + ($b[4] mod 64)"/>
  <!-- 2-octet forms cannot be overlong because leads 192 and 193 are excluded above -->
  <xsl:variable name="valid" select="
      exists($cp)
      and ($len le 2
           or ($len eq 3 and $cp ge 2048 and not($cp ge 55296 and $cp le 57343))
           or ($len eq 4 and $cp ge 65536 and $cp le 1114111))"/>
  <xsl:choose>
    <xsl:when test="empty($octets)"><!--done--></xsl:when>
    <xsl:when test="$valid">
      <xsl:value-of select="codepoints-to-string($cp)"/>
      <xsl:call-template name="internal-utf8">
        <xsl:with-param name="octets" select="$octets[position() gt $len]"/>
      </xsl:call-template>
    </xsl:when>
    <xsl:otherwise>
      <illegal-octet><xsl:value-of select="$octets[1]"/></illegal-octet>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
</xsl:transform>
Output:
<?xml version="1.0" encoding="UTF-8"?> <results xmlns:xs="http://www.w3.org/2001/XMLSchema"> <result> <input> <enc>utf-8</enc> <value>A%20C</value> </input> <parsed> <string>A C</string> </parsed> </result> <result> <input> <enc>iso-8859-1</enc> <value>A%20C</value> </input> <parsed> <string>A C</string> </parsed> </result> <result> <input> <enc>utf-8</enc> <value>A%C3%A4</value> </input> <parsed> <string>Aä</string> </parsed> </result> <result> <input> <enc>iso-8859-1</enc> <value>A%E4</value> </input> <parsed> <string>Aä</string> </parsed> </result> <result> <input> <enc>utf-8</enc> <value>A%E4</value> </input> <parsed> <illegal-octet>228</illegal-octet> </parsed> </result> </results>
Best regards, Julian
Current Thread |
---|
|
<- Previous | Index | Next -> |
---|---|---|
Re: [xsl] decoding percent-escaped , Julian Reschke | Thread | Fw: [xsl] decoding percent-escaped , Hermann Stamm-Wilbra |
Re: [xsl] need xsl template for thi, Matthieu Ricaud-Duss | Date | [xsl] Multiple/conditional import p, Martynas Jusevicius |
Month |
Keywords