[XSL-LIST Mailing List Archive Home] [By Thread] [By Date]

Re: [xsl] Sorting substitution instructions by max. length of matches


Subject: Re: [xsl] Sorting substitution instructions by max. length of matches
From: David Carlisle <davidc@xxxxxxxxx>
Date: Fri, 5 Oct 2007 16:05:05 +0100

I'd probably use xsl:analyze-string rather than an explicit recursion
to do the lookup.  (I think both approaches may fail to find the
longest possible match, since an earlier match can obscure a longer
match that would otherwise have been found later in the string.)

Something like:

<xsl:stylesheet
    version="2.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:my="http://xmlns.srz.de/yforkl/xslt/functions"
    exclude-result-prefixes="my xs">

  <!-- using dashes in function names, underscores in variable names -->

  <!--
    my:max-match-length($text, $regex)

    Returns the length of the longest substring of $text that
    xsl:analyze-string reports as a match for $regex, or 0 when
    nothing matches.

    $text   the string to scan
    $regex  the regular expression to apply

    Matches are the non-overlapping ones produced by a single
    xsl:analyze-string pass, so this is the longest match actually
    found, not necessarily the longest one that could exist.

    NOTE: XSLT 2.0 makes it a dynamic error (XTDE1150) for the regex
    of xsl:analyze-string to match a zero-length string, so a pattern
    such as 'x*' is rejected by the processor.
  -->
  <xsl:function name="my:max-match-length" as="xs:integer">
    <xsl:param name="text" as="xs:string"/>
    <xsl:param name="regex" as="xs:string"/>

    <xsl:variable name="lengths" as="xs:integer*">
      <xsl:analyze-string select="$text" regex="{$regex}">
        <xsl:matching-substring>
          <xsl:sequence select="string-length(.)"/>
        </xsl:matching-substring>
      </xsl:analyze-string>
    </xsl:variable>

    <!-- the leading 0 keeps max() defined when there is no match -->
    <xsl:sequence select="max((0, $lengths))"/>
  </xsl:function>


  <!--
    Entry point (run with "-it main").  Measures every substitution
    instruction against the sample input, drops the ones that never
    match, orders the rest by longest match (descending) and reports
    the result through xsl:message.
  -->
  <xsl:template name="main">

    <!-- sample data -->
    <xsl:variable name="input"
                  select="'abcddddxxxxxxxyyyabcxxxxxabcdxabc'"/>

    <!-- sample data -->
    <xsl:variable name="substitution_instructions">
      <substitution>
        <old>[a-e]+</old>
        <new>***</new>
      </substitution>
      <substitution>
        <old>d</old>
        <new>#</new>
      </substitution>
      <substitution>
        <old>123</old>
        <new>...</new>
      </substitution>
      <substitution>
        <old>c+</old>
        <new>#</new>
      </substitution>
      <substitution>
        <old>x+y*</old>
        <new>+++</new>
      </substitution>
    </xsl:variable>

    <!-- Step 1: run each regex over the input exactly once and keep
         only the instructions that match somewhere, annotated with
         their longest match length. -->
    <xsl:variable name="substitution_instructions_measured">
      <xsl:for-each select="$substitution_instructions/*">
        <xsl:variable name="max_length" as="xs:integer"
                      select="my:max-match-length($input, string(old))"/>
        <xsl:if test="$max_length != 0">
          <substitution>
            <max_match_length><xsl:value-of select="$max_length"/></max_match_length>
            <xsl:copy-of select="*"/>
          </substitution>
        </xsl:if>
      </xsl:for-each>
    </xsl:variable>

    <!-- Step 2: order by the recorded length.  The cast makes the
         comparison numeric (10 before 7) rather than lexical. -->
    <xsl:variable name="substitution_instructions_sorted">
      <xsl:perform-sort
          select="$substitution_instructions_measured/substitution">
        <xsl:sort select="xs:integer(max_match_length)"
                  order="descending"/>
      </xsl:perform-sort>
    </xsl:variable>

    <xsl:message>
      <xsl:value-of
          select="concat('input: ', $input, '&#10;')"/>
      <xsl:value-of
          select="'========================================&#10;'"/>
      <xsl:for-each
          select="$substitution_instructions_sorted/substitution">
        <xsl:value-of
            select="concat('regex: ', old, '&#10;')"/>
        <xsl:value-of
            select="concat('max. match length: ', max_match_length,
                    '&#10;')"/>
        <xsl:value-of
            select="'----------------------------------------&#10;'"/>
      </xsl:for-each>
    </xsl:message>

  </xsl:template>

</xsl:stylesheet>

$ saxon8 -it main regexlength.xsl~ 
input: abcddddxxxxxxxyyyabcxxxxxabcdxabc
========================================
regex: x+y*
max. match length: 10
----------------------------------------
regex: [a-e]+
max. match length: 7
----------------------------------------
regex: c+
max. match length: 1
----------------------------------------
regex: d
max. match length: 1
----------------------------------------

________________________________________________________________________
The Numerical Algorithms Group Ltd is a company registered in England
and Wales with company number 1249803. The registered office is:
Wilkinson House, Jordan Hill Road, Oxford OX2 8DR, United Kingdom.

This e-mail has been scanned for all viruses by Star. The service is
powered by MessageLabs. 
________________________________________________________________________


Current Thread