access temporary tree when transformation not finished - xslt-3.0

I have a two phases XSLT transformation with xml as output. When I set a breakpoint in one of templates during my first phase and start my xslt transformation in debug mode with XML Spy Professional 2020, I can see an xml structure in XSL Output.xml as the processed result before my template with breakpoint is applied.
My question is, is there a way in one template in the same phase to access this structure, which is a temporary result of transformation, which is not yet completed?
For development I use XML Spy Professional 2020 and for transformation in application I use Saxon Professional Edition SaxonPE9-9-1-3J.
My problem is following:
Input is a plain text https://gist.github.com/jia2/35143e79213864153b57ad0323a440a8#file-input-txt
Based on this format rules https://gist.github.com/jia2/76d676b90935cb7f33f5028180557af3,
the expected XML output like this:
https://gist.github.com/jia2/daaa4b2de5d1dadcb834f9f91c65d45b
Here my template:
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:fn="http://www.w3.org/2005/xpath-functions" xmlns:csb="http://www.dbcargo.org/csb" exclude-result-prefixes="#all" version="3.0">
<!-- <xsl:param name="msg" as="xs:string">H0 EVU_DBSRD PVG Z24 ABF-RF IR ExternalPartnerID_uuuuuuuuuuuuuuuuu0202017-03-16-07.27.40.864320NJNJ M1 80281261300008 M2 16.03.201707:27:00Z1 H62430 16.03.2017 16.03.201707:00:00+0027R1 00131800820664780201703154023641201703151159043706346965 000 JJ R1 02031800819657480201703154045545201703151159306557346965 000 NN </xsl:param> -->
<xsl:param name="msg" as="xs:string">H0 EVU_DBSRD PVG Z24 ABF-RF IR ExternalPartnerID_uuuuuuuuuuuuuuuuu0202017-03-16-07.27.40.864320NJNJJJ M1 80281261300008 M2 16.03.201707:27:00Z1 H62430 16.03.2017 16.03.201707:00:00+0027R1 00131800820664780201703154023641201703151159043706346965 000 JJ R1 02031800819657480201703154045545201703151159306557346965 000 NN </xsl:param>
<xsl:param name="relatviePath2MFL" as="xs:string" select="'./format.xml'"/>
<xsl:variable name="MFL" select="document($relatviePath2MFL)"/>
<xsl:output method="xml" indent="yes"/>
<xsl:mode name="unroll" on-no-match="shallow-copy"/>
<xsl:strip-space elements="*"/>
<xsl:template match="StructFormat[#repeat]" mode="unroll">
<xsl:variable name="this" select="."/>
<xsl:choose>
<xsl:when test="$this/#repeat != '*' ">
<xsl:for-each select="1 to #repeat">
<xsl:choose>
<xsl:when test="$this/#delimOptional = 'n' and $this/TagField and contains($msg, $this/TagField)">
<xsl:copy select="$this">
<xsl:apply-templates select="#* except #repeat, node()" mode="#current"/>
</xsl:copy>
</xsl:when>
<xsl:otherwise/>
</xsl:choose>
</xsl:for-each>
</xsl:when>
<xsl:otherwise>
<xsl:variable name="repeat" select="count(tokenize($msg, $this/TagField/#value)) - 1"/>
<xsl:for-each select="1 to $repeat">
<xsl:copy select="$this">
<xsl:apply-templates select="#* except #repeat, node()" mode="#current"/>
</xsl:copy>
</xsl:for-each>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<xsl:template match="StructFormat[not(#repeat)]" mode="unroll">
<xsl:variable name="this" select="."/>
<xsl:choose>
<xsl:when test="$this/TagField and not(contains($msg, $this/TagField/#value)) ">
</xsl:when>
<xsl:otherwise>
<xsl:copy select="$this">
<xsl:apply-templates select="#* except #repeat, node()" mode="#current"/>
</xsl:copy>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<xsl:template match="FieldFormat[#repeat]" mode="unroll">
<xsl:variable name="this" select="."/>
<xsl:for-each select="1 to #repeat">
<xsl:copy select="$this">
<xsl:apply-templates select="#* except #repeat, node()" mode="#current"/>
</xsl:copy>
</xsl:for-each>
</xsl:template>
<xsl:variable name="complete-struct">
<xsl:apply-templates select="$MFL/*" mode="unroll"/>
</xsl:variable>
<xsl:template match="/">
<xsl:element name="{$MFL/MessageFormat/#name}">
<xsl:apply-templates select="$complete-struct/*"/>
</xsl:element>
</xsl:template>
<xsl:template match="StructFormat">
<xsl:element name="{#name}">
<xsl:apply-templates/>
</xsl:element>
</xsl:template>
<xsl:template match="FieldFormat">
<xsl:variable name="precedingFieldFormatsLength" select="sum(preceding::FieldFormat/#length)"/>
<xsl:variable name="offset">
<xsl:value-of select="string-length(string-join(./preceding::TagField/#value, ''))"/>
</xsl:variable>
<xsl:element name="{#name}">
<xsl:variable name="value" select="substring($msg, 1 + $precedingFieldFormatsLength + $offset, #length)"/>
<xsl:value-of select="csb:formatField(.,$value)"/>
</xsl:element>
</xsl:template>
<!-- format output -->
<xsl:function name="csb:formatField" as="xs:string">
<xsl:param name="field" as="element()"/>
<xsl:param name="value" as="xs:string"/>
<xsl:choose>
<xsl:when test="$field/#length = '1' and $value = ' '">
<xsl:value-of select="''"/>
</xsl:when>
<!-- remove leading and trailing space -->
<xsl:when test="$field/#trimLeading = ' ' and $field/#trimTrailing = ' '">
<xsl:value-of select="fn:replace($value, '^\s+|\s+$', '')"/>
</xsl:when>
<!-- remove ONLY leading space -->
<xsl:when test="$field/#trimLeading = ' ' and fn:not(fn:exists($field//#trimTrailing))">
<xsl:value-of select="fn:replace($value, '^\s+', '')"/>
</xsl:when>
<!-- remove ONLY trailing space -->
<xsl:when test="$field/#trimTrailing = ' ' and fn:not(fn:exists($field//#trimLeading))">
<xsl:value-of select="fn:replace($value, '\s+$', '')"/>
</xsl:when>
<!-- remove leading 0 -->
<xsl:when test="$field/#type = 'Numeric' and $field/#trimLeading = '0' and fn:not(fn:exists($field//#trimTrailing))">
<!-- <xsl:value-of select="fn:replace($value, '^0+', '')"/> -->
<xsl:if test="number($value) != number($value)">
<xsl:message terminate="yes" ><xsl:value-of select="concat('Transformation failed. The field', $field, ' has invalid value')" /></xsl:message>
</xsl:if>
<xsl:value-of select="number($value)"/>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$value"/>
</xsl:otherwise>
</xsl:choose>
</xsl:function>
</xsl:stylesheet>
My xslt template https://gist.github.com/jia2/5f7387e549e6f83601dbfac23ceb3acf is reading this format.xml as input and the plain text is passed by as parameter. It works for some inputs. but it will fail, when the value, which is used for marking begin of "StructFormat", exist in other positions.
For example:
<StructFormat name='HandoverTakeover' delimOptional='n' optional='y'>
<TagField type='String' value='U1 '/>
This means, StructFormat should be generated, when the input has "U1 " at a position. Now I'm just checking if input text contains "U1 " (<xsl:when test="$this/TagField and not(contains($msg, $this/TagField/#value)) ">), but this is not enough, I need to check, if the "U1 " comes in the "right" position range, not in the whole input.
I though if I can access currently build result tree, I can count the length until now to to cut the text before this position where I am checking.
Thanks
Dingjun

XSLT is a functional language; it therefore disallows operations whose result would depend on the order of execution. The fact that particular processors organise the processing in a particular way (even when two different processors choose the same strategy) doesn't mean it is something that can be relied on; in a few years time, for example, parallel execution strategies may be much more common.
More specifically, the fact that the two phases of your transformation are executing "concurrently" (one starts before the other finishes) is an internal optimization that you cannot exploit or rely on, and this is by design.
No doubt the transformation you are trying to effect can be achieved in some completely different way within the paradigm of a declarative functional language. I haven't studied the particular problem; like many people answering questions on StackOverflow, I'm not prepared to follow links to code that's off-site.

Related

What's the best way to move space-delimited tokens from one attribute to another in XSLT-2.0?

I'm trying to move space-delimited tokens from one attribute to another in XSLT-2.0. For example, given
<!-- SOURCE DOCUMENT -->
<?xml version="1.0" encoding="UTF-8"?>
<root>
<p class="foo"/>
<p class="foo bar baz"/>
<p class="foo bar baz" outputclass="BAR"/>
<p class="foo bar baz" outputclass="BAR HELLO"/>
</root>
I need to move #class="foo" to #outputclass="FOO" and #class="bar" to #outputclass="BAR", deleting the source attribute if it becomes empty and augmenting the target attribute if it exists (simple token-set operations):
<!-- RESULTING DOCUMENT -->
<?xml version="1.0" encoding="UTF-8"?>
<root>
<p outputclass="FOO"/>
<p class="baz" outputclass="FOO BAR"/>
<p class="baz" outputclass="FOO BAR"/>
<p class="baz" outputclass="FOO BAR HELLO"/>
</root>
I think I have everything figured out except the actual token-moving part. Every direction I go down ends up complicated and broken, and I feel like XSLT-2.0 surely has a simple approach that I'm missing.
Here's what I have so far:
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:mine="mine:local"
exclude-result-prefixes="xs"
version="2.0">
<!-- baseline identity transform -->
<!-- (for non-elements - attributes, whitespace PCDATA, etc.) -->
<xsl:template match="#*|(node() except *)">
<xsl:copy>
<xsl:apply-templates select="#*|node()"/>
</xsl:copy>
</xsl:template>
<!-- for element nodes, remap attributes then copy element -->
<xsl:template match="*">
<!-- get original attribute sequence -->
<xsl:variable name="atts1" select="#*"/>
<!-- use our function to remap two attribute tokens -->
<xsl:variable name="atts2" select="mine:remap($atts1, 'class', 'foo', 'outputclass', 'FOO')"/>
<xsl:variable name="atts3" select="mine:remap($atts2, 'class', 'bar', 'outputclass', 'BAR')"/>
<!-- stuff updated attribute sequence into element -->
<xsl:copy>
<xsl:sequence select="$atts3"/>
<xsl:apply-templates select="node()"/>
</xsl:copy>
</xsl:template>
<!-- remap #from_att~="$from_token" to #to_att~="$to_token" -->
<xsl:function name="mine:remap">
<xsl:param name="orig_atts"/>
<xsl:param name="from_att"/>
<xsl:param name="from_token"/>
<xsl:param name="to_att"/>
<xsl:param name="to_token"/>
<!-- ******** TOKEN-MOVING MAGIC!?! ******** -->
<xsl:sequence select="$orig_atts"/>
</xsl:function>
</xsl:stylesheet>
Basically I need to figure out how TOKEN-MOVING MAGIC!?! can move a single token (including deletion of empty "from" attributes). I've searched quite a bit but I haven't seen this particular problem covered.
Edit: The number and names of attributes to remap can be anything, and their values are case-sensitive. It's the magic inside the mine:remap function to remap a single value in an attribute sequence that I'm looking for.
Edit: The reason for approaching attribute modification with a function is that we have a number of different token remappings to apply to different files, and I hoped to allow our non-XSLT-savvy users to easily adjust the remappings to their needs. I was unable to figure out how to provide similar generalization with a template-matching-based approach.
Thanks!
Here is a short XSLT 2.0 solution (just 26 lines):
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output omit-xml-declaration="yes" indent="yes"/>
<xsl:template match="node()|#*">
<xsl:copy>
<xsl:apply-templates select="node()|#*"/>
</xsl:copy>
</xsl:template>
<xsl:template match="p/#class[tokenize(., ' ') = ('foo', 'bar')]">
<xsl:if test="tokenize(., ' ')[not(. = ('foo', 'bar'))]">
<xsl:attribute name="class"
select="string-join(tokenize(., ' ')[not(. = ('foo', 'bar'))], ' ')"/>
</xsl:if>
<xsl:attribute name="outputclass" select=
"upper-case(string-join(
(
tokenize(., ' ')[. = ('foo', 'bar')],
tokenize(../#outputclass, ' ')
[not(lower-case(.) = tokenize(current(), ' '))]
),
' '
)
)"/>
</xsl:template>
<xsl:template match="p/#outputclass[../#class[tokenize(., ' ') = ('foo', 'bar')]]"/>
</xsl:stylesheet>
When this transformation is applied on the provided XML document:
<root>
<p class="foo"/>
<p class="foo bar baz"/>
<p class="foo bar baz" outputclass="BAR"/>
<p class="foo bar baz" outputclass="BAR HELLO"/>
</root>
the wanted, correct result is produced:
<root>
<p outputclass="FOO"/>
<p class="baz" outputclass="FOO BAR"/>
<p class="baz" outputclass="FOO BAR"/>
<p class="baz" outputclass="FOO BAR HELLO"/>
</root>
Update:
Here is the same transformation with almost everything parameterized, as requested in a comment by the OP, just 32 lines:
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output omit-xml-declaration="yes" indent="yes"/>
<xsl:param name="pfromName" select="'class'"/>
<xsl:param name="ptoName" select="'outputclass'"/>
<xsl:param name="pTokens" select="'foo', 'bar'"/>
<xsl:param name="pnewNames" select="'FOO', 'BAR'"/>
<xsl:template match="node()|#*">
<xsl:copy>
<xsl:apply-templates select="node()|#*"/>
</xsl:copy>
</xsl:template>
<xsl:template match="p/#*[name() = $pfromName][tokenize(., ' ') = $pTokens]">
<xsl:if test="tokenize(., ' ')[not(. = $pTokens)]">
<xsl:attribute name="{$pfromName}"
select="string-join(tokenize(., ' ')[not(. = $pTokens)], ' ')"/>
</xsl:if>
<xsl:attribute name="{$ptoName}" select=
"upper-case(string-join(
(
tokenize(., ' ')[. = $pTokens],
tokenize(../#*[name()=$ptoName], ' ')
[not(lower-case(.) = tokenize(current(), ' '))]
),
' '
)
)"/>
</xsl:template>
<xsl:template
match="p/#*[name()=$ptoName][../#*[name()=$pfromName][tokenize(., ' ') = $pTokens]]"/>
</xsl:stylesheet>
Update2:
Here is a completely parameterized XSLT 2.0 transformation (not using the upper-case() and lower-case() functions), just 37 lines:
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output omit-xml-declaration="yes" indent="yes"/>
<xsl:param name="pfromName" select="'class'"/>
<xsl:param name="ptoName" select="'outputclass'"/>
<xsl:param name="pTokens" select="'foo', 'bar'"/>
<xsl:param name="pnewNames" select="'FOO', 'BAR'"/>
<xsl:template match="node()|#*">
<xsl:copy>
<xsl:apply-templates select="node()|#*"/>
</xsl:copy>
</xsl:template>
<xsl:template match="p/#*[name() = $pfromName][tokenize(., ' ') = $pTokens]">
<xsl:if test="tokenize(., ' ')[not(. = $pTokens)]">
<xsl:attribute name="{$pfromName}"
select="string-join(tokenize(., ' ')[not(. = $pTokens)], ' ')"/>
</xsl:if>
<xsl:attribute name="{$ptoName}" select=
"string-join(
distinct-values(
(for $token in tokenize(., ' ')[. = $pTokens],
$n in 1 to count($pTokens),
$ind in $n[$token eq $pTokens[$n]]
return $pnewNames[$ind]
,
tokenize(../#*[name()=$ptoName], ' ')
)
),
' '
)
"/>
</xsl:template>
<xsl:template
match="p/#*[name()=$ptoName][../#*[name()=$pfromName][tokenize(., ' ') = $pTokens]]"/>
</xsl:stylesheet>
In the following sample I have tried to delegate as much as possible to templates:
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
exclude-result-prefixes="#all"
version="3.0">
<xsl:param name="tokens" as="xs:string*"
select="'foo', 'bar'"/>
<xsl:param name="collation" as="xs:string">http://www.w3.org/2005/xpath-functions/collation/html-ascii-case-insensitive</xsl:param>
<xsl:mode on-no-match="shallow-copy"/>
<xsl:template match="*[#class][exists($tokens[contains-token(current()/#class, ., $collation)])]">
<xsl:copy>
<xsl:variable name="new-att" as="attribute()">
<xsl:attribute name="outputclass"/>
</xsl:variable>
<xsl:apply-templates select="#*, $new-att[not(current()/#outputclass)]">
<xsl:with-param name="tokens-found"
select="$tokens[contains-token(current()/#class, ., $collation)]"/>
</xsl:apply-templates>
<xsl:apply-templates/>
</xsl:copy>
</xsl:template>
<xsl:template match="#class">
<xsl:param name="tokens-found"/>
<xsl:variable name="remaining-tokens" select="tokenize(., ' ')[not(. = $tokens-found)]"/>
<xsl:if test="exists($remaining-tokens)">
<xsl:attribute name="{name()}" select="$remaining-tokens"/>
</xsl:if>
</xsl:template>
<xsl:template match="#outputclass">
<xsl:param name="tokens-found"/>
<xsl:variable name="new-tokens" select="$tokens-found[not(contains-token(current(), ., $collation))]"/>
<xsl:attribute name="{name()}" select="$new-tokens, ."/>
</xsl:template>
</xsl:stylesheet>
https://xsltfiddle.liberty-development.net/bEzkTcx/1
I haven't implemented the upper-case transformation of the tokens to be moved, I guess it should be easy to add that.
The code uses XSLT 3 with XPath 3 and the function https://www.w3.org/TR/xpath-functions/#func-contains-token but it has a definition in the spec that you could use in a user-defined XSLT 2 function. It is of course also easy to not declare the identity transformation using xsl:mode but by spelling it out.
XSLT 3 is available with Saxon 9.8 or later for Java and .NET, with Saxon-C for C/C++, with bindings for PHP and Python and with Saxon-JS 2 inside of modern web browsers and for Node.js.
Here is what I ended up with for the mine:remap() function:
<!-- remap #from_att~="$from_token" to #to_att~="$to_token" -->
<xsl:function name="mine:remap">
<xsl:param name="orig_atts" as="attribute()*"/>
<xsl:param name="from_att"/>
<xsl:param name="from_token"/>
<xsl:param name="to_att"/>
<xsl:param name="to_token"/>
<!-- get tokenized list of values of "from" attributes -->
<xsl:variable name="from_att_values" select="tokenize($orig_atts[name() = $from_att], ' ')"/>
<xsl:choose>
<!-- does the "from" attribute contain our value to replace? -->
<xsl:when test="$from_att_values = $from_token">
<!-- if so, iterate through attributes to preserve their order -->
<xsl:for-each select="$orig_atts">
<xsl:choose>
<!-- if "from" and "to" attributes are the same, replace $from_token with $to_token in-place -->
<xsl:when test="(name(.) = $from_att) and ($from_att = $to_att)">
<xsl:attribute name="{name(.)}" select="for $t in $from_att_values
return ($t[$t != $from_token], $to_token[$t = $from_token])"/>
</xsl:when>
<!-- if "from" attribute, define with $from_token value removed -->
<xsl:when test="name(.) = $from_att">
<xsl:variable name="new_from_att_values" select="$from_att_values[not(. = $from_token)]"/>
<xsl:if test="count($new_from_att_values) > 0">
<xsl:attribute name="{$from_att}" select="$new_from_att_values"/>
</xsl:if>
</xsl:when>
<!-- if "to" attribute, define with $to_token value added -->
<xsl:when test="name(.) = $to_att">
<xsl:attribute name="{$to_att}" select="distinct-values((tokenize(., ' '), $to_token))"/>
</xsl:when>
<xsl:otherwise>
<xsl:copy/>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each>
<!-- if there was no "from" attribute to modify above, create it here -->
<xsl:if test="not($orig_atts[name() = $to_att])">
<xsl:attribute name="{$to_att}" select="$to_token"/>
</xsl:if>
</xsl:when>
<!-- if not, return original attributes -->
<xsl:otherwise>
<xsl:sequence select="$orig_atts"/>
</xsl:otherwise>
</xsl:choose>
</xsl:function>
I iterate through the attributes to preserve their order, then I use xsl:choose to handle the from (remove a token), to (add a token), or other (copy) attributes.

In XSLT, How do I split the text content of an element into lines?

I'm using XSLT to parse the textual content of an XML element. This text contains newlines, but I can't seem to parse them correctly. I'm using code I found online to chop up the text. Here's the relevant part of the code.
<xsl:variable name="first">
<xsl:value-of select="substring-before($source, $newline)"/>
</xsl:variable>
<xsl:variable name="rest">
<xsl:value-of select="substring-after($source, $newline)"/>
</xsl:variable>
This is part of a recusrive template that pushes $rest into itself.
The problem is that the code sample doesn't define $newline.
If I set $newline to a letter, like 's', the text gets split up just fine (e.g. it will turn the input "resounding" into "re" and "ounding"). But when I try to set $newline to the newline character, that is
or  , it recurses forever and gives me a stack overflow. I also tried to define an ENTITY for newline but it makes no difference.
The input has ordinary CR/LF at the end of each line (I'm on a Windows box).
What am I doing wrong?
If you can use EXSLT try with str:tokenize
<xsl:for-each select="str:tokenize($source, $newline)">
<xsl:value-of select="."/>
<xsl:text>
</xsl:text>
</xsl:for-each>
Or similarly with XSLT 2.0:
<xsl:for-each select="tokenize($source, $newline)">
<xsl:sequence select="."/>
<xsl:text>
</xsl:text>
</xsl:for-each>
You may be able to use the below.
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" indent="yes"/>
<xsl:template match="/">
<root>
<xsl:for-each select="root/str">
<str>
<xsl:call-template name="strSplit">
<xsl:with-param name="str" select="."/>
<xsl:with-param name="seqno" select="1"/>
</xsl:call-template>
</str>
</xsl:for-each>
</root>
</xsl:template>
<xsl:template name="strSplit">
<xsl:param name="str"/>
<xsl:param name="seqno"/>
<xsl:variable name="afterLeadingWS"
select="substring-after($str, substring-before($str,substring-before(normalize-space($str), ' ')))"/>
<xsl:choose>
<xsl:when test="contains($afterLeadingWS, '
')">
<line>
<xsl:attribute name="seqno"><xsl:value-of select="$seqno"/></xsl:attribute>
<xsl:attribute name="length"><xsl:value-of select="string-length(substring-before($afterLeadingWS, '
'))"/></xsl:attribute>
<xsl:value-of select="substring-before($afterLeadingWS, '
')"/>
</line>
<xsl:call-template name="strSplit">
<xsl:with-param name="str" select="substring-after($afterLeadingWS, '
')"/>
<xsl:with-param name="seqno" select="$seqno + 1"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<line>
<xsl:attribute name="seqno"><xsl:value-of select="$seqno"/></xsl:attribute>
<xsl:value-of select="$afterLeadingWS"/>
</line>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
</xsl:stylesheet>
Applied to
<?xml version="1.0" encoding="UTF-8"?>
<root>
<str>
yigifgniuq h
eukwgf kuew hgk.uhgku
,/v.,silghouihhg
</str>
<str>
09734ymmnyr n.0808
o149013483ymr7rg
738924m c0
</str>
</root>
the output result is
<?xml version="1.0" encoding="UTF-8"?>
<root>
<str>
<line seqno="1" length="13">yigifgniuq h </line>
<line seqno="2" length="21">eukwgf kuew hgk.uhgku</line>
<line seqno="3" length="18"> ,/v.,silghouihhg</line>
<line seqno="4"> </line>
</str>
<str>
<line seqno="1" length="18">09734ymmnyr n.0808</line>
<line seqno="2" length="16">o149013483ymr7rg</line>
<line seqno="3" length="11">738924m c0 </line>
<line seqno="4" length="2"> </line>
<line seqno="5"> </line>
</str>
</root>
Note that leading tabs (or blanks) are seen as part of lines.
Maestro13's answer brought me closest, and I ended up merging the template I had with his, to produce this, which I share here for future generations. It's a template that returns the length of the longest line in the string you pass to it.
<xsl:template name="longestCodeLine">
<xsl:param name="str"/>
<xsl:choose>
<!-- Is this the last line? -->
<xsl:when test="contains($str, '
')">
<!-- No. First isolate all remaining lines, and recurse to find its longest line. -->
<xsl:variable name="bestOfTheRest">
<xsl:call-template name="longestCodeLine">
<xsl:with-param name="str" select="substring-after($str, '
')"/>
</xsl:call-template>
</xsl:variable>
<xsl:choose>
<!-- Compare the longest of the remaining lines to this one. Which one's longer? -->
<!-- If the longest of the remaining lines is longer, return that line. -->
<xsl:when test="string-length($bestOfTheRest) > string-length(substring-before($str, '
'))">
<xsl:value-of select="$bestOfTheRest"/>
</xsl:when>
<!-- If this line longer, return this line. -->
<xsl:otherwise>
<xsl:value-of select="substring-before($str, '
')"/>
</xsl:otherwise>
</xsl:choose>
</xsl:when>
<!-- If there are no \n's left, this is your last string. So it is by definition the longest one left. -->
<xsl:otherwise>
<xsl:value-of select="$str"/>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
I have used this template once. It is a named template so you can call it where ever you need it. The text here is split up in 70 character pieces:
<xsl:template name="Texts">
<xsl:param name="string" select="TEXTITEM" />
<xsl:param name="line-length" select="70"/>
<xsl:variable name="line" select="substring($string,1,$line-length)"/>
<xsl:variable name="rest" select="substring($string, $line-length+1)"/>
<xsl:if test="$line">
<MYTEXT>
<xsl:value-of select="$line"/>
</MYTEXT>
</xsl:if>
<xsl:if test="$rest">
<xsl:call-template name="Texts">
<xsl:with-param name="string" select="$rest"/>
<xsl:with-param name="line-length" select="$line-length"/>
</xsl:call-template>
</xsl:if>
</xsl:template>
Thought I'd add a line splitting code that adds newlines after white space.
<xsl:function name="kode:splitLongLine">
<xsl:param name="string"/>
<xsl:variable name="regex">
<xsl:text>(((.){1,55})( |$))</xsl:text>
</xsl:variable>
<xsl:variable name="result">
<xsl:analyze-string select="$string" regex="{$regex}">
<xsl:matching-substring>
<xsl:value-of select="concat(regex-group(1),'
')"/>
</xsl:matching-substring>
<xsl:non-matching-substring>
<xsl:value-of select="concat('REPORT ERROR: ', .)"/>
</xsl:non-matching-substring>
</xsl:analyze-string>
</xsl:variable>
<xsl:sequence select="$result"/>
</xsl:function>

XSL Multiple search and replace function

I am attempting to use the XSL translate() function to create something like a search and replace function as follows:
<xsl:template name="create-id">
<xsl:param name="id" />
<xsl:call-template name="search-and-replace">
<xsl:with-param name="str" select="$id" />
<xsl:with-param name="search">0123456789</xsl:with-param>
<xsl:with-param name="replace">abcdefghij</xsl:with-param>
</xsl:call-template>
</xsl:template>
<xsl:template name="search-and-replace">
<xsl:param name="str" />
<xsl:param name="search" />
<xsl:param name="replace" />
<xsl:variable name="newstr" select="translate($str, $search,
$replace)" />
<xsl:choose>
<xsl:when test="contains($newstr, $search)">
<xsl:call-template name="search-and-replace">
<xsl:with-param name="str" select="$newstr" />
<xsl:with-param name="search" select="$search" />
<xsl:with-param name="replace" select="$replace" />
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$newstr" />
</xsl:otherwise>
</xsl:choose>
</xsl:template>
However, something about my logic is wrong here as it appears to be stripping off the last character in the returned string. My guess is that translate() is only replacing the first instance of each character in the string and is not truly recursive.
Any thoughts or input would be appreciated.
The translate() function can only replace a single character with another single character (or with the empty character (delete)). Thus it cannot solve the problem of string replacement.
Here is a complete XSLT 1.0 solution to the multiple-replace problem:
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:my="my:my">
<xsl:output omit-xml-declaration="yes"/>
<xsl:strip-space elements="*"/>
<my:params xml:space="preserve">
<pattern>
<old>
</old>
<new><br/></new>
</pattern>
<pattern>
<old>quick</old>
<new>slow</new>
</pattern>
<pattern>
<old>fox</old>
<new>elephant</new>
</pattern>
<pattern>
<old>brown</old>
<new>white</new>
</pattern>
</my:params>
<xsl:variable name="vPats"
select="document('')/*/my:params/*"/>
<xsl:template match="text()" name="multiReplace">
<xsl:param name="pText" select="."/>
<xsl:param name="pPatterns" select="$vPats"/>
<xsl:if test="string-length($pText) >0">
<xsl:variable name="vPat" select=
"$vPats[starts-with($pText, old)][1]"/>
<xsl:choose>
<xsl:when test="not($vPat)">
<xsl:copy-of select="substring($pText,1,1)"/>
</xsl:when>
<xsl:otherwise>
<xsl:copy-of select="$vPat/new/node()"/>
</xsl:otherwise>
</xsl:choose>
<xsl:call-template name="multiReplace">
<xsl:with-param name="pText" select=
"substring($pText, 1 + not($vPat) + string-length($vPat/old/node()))"/>
</xsl:call-template>
</xsl:if>
</xsl:template>
</xsl:stylesheet>
when this transformation is applied on the following XML document:
<t>The quick
brown fox</t>
the wanted, correct result is produced:
The slow<br />white elephant
Explanation:
A named template that calls itself recursively is used.
All multiple replacement pattern --> replacement pairs are provided in a single external parameter, which for convenience here is specified inline as the global-level element <my:params> .
The recursion takes every single character in the source string (from left to right) and finds the first pattern that starts with this character at this position in the string.
The replacement can be not only a string but also any node. In this specific case we are replacing every NL character with a <br/> element.
The definition of the translate($arg, $mapString, $transString) function is:
Returns the value of $arg modified so that
every character in the value of $arg
that occurs at some position N in the
value of $mapString has been replaced
by the character that occurs at
position N in the value of
$transString.
That is, it does not replace a substring with another string, but rather maps characters to other characters. For substring replacement, use something like
<xsl:template name="search-and-replace">
<xsl:param name="str"/>
<xsl:param name="search"/>
<xsl:param name="replace"/>
<xsl:choose>
<xsl:when test="contains($str, $search)">
<xsl:value-of select="substring-before($str, $search)"/>
<xsl:value-of select="$replace"/>
<xsl:call-template name="search-and-replace">
<xsl:with-param name="str" select="substring-after($str, $search)"/>
<xsl:with-param name="search" select="$search"/>
<xsl:with-param name="replace" select="$replace"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$str"/>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
Seems like your will never be true as you have already replaced every characters in $search to $replace in
<xsl:variable name="newstr" select="translate($str, $search,
$replace)" />
beforehand.
str:replace from exslt. It does exactly what you want, and is already written and testing by some xslt gurus. The xslt source is also available there.

How can I speed up my 'divide and conquer' XSLT template which replaces certain characters in a string?

UPDATE: I added an answer to this question which incorporates almost all the suggestions which have been given. The original template given in the code below needed 45605ms to finish a real world input document (english text about script programming). The revised template in the community wiki answer brought the runtime down to 605ms!
I'm using the following XSLT template for replacing a few special characters in a string with their escaped variants; it calls itself recursively using a divide-and-conquer strategy, eventually looking at every single character in a given string. It then decides whether the character should be printed as it is, or whether any form of escaping is necessary:
<xsl:template name="escape-text">
<xsl:param name="s" select="."/>
<xsl:param name="len" select="string-length($s)"/>
<xsl:choose>
<xsl:when test="$len >= 2">
<xsl:variable name="halflen" select="round($len div 2)"/>
<xsl:variable name="left">
<xsl:call-template name="escape-text">
<xsl:with-param name="s" select="substring($s, 1, $halflen)"/>
<xsl:with-param name="len" select="$halflen"/>
</xsl:call-template>
</xsl:variable>
<xsl:variable name="right">
<xsl:call-template name="escape-text">
<xsl:with-param name="s" select="substring($s, $halflen + 1)"/>
<xsl:with-param name="len" select="$halflen"/>
</xsl:call-template>
</xsl:variable>
<xsl:value-of select="concat($left, $right)"/>
</xsl:when>
<xsl:otherwise>
<xsl:choose>
<xsl:when test="$s = '"'">
<xsl:text>"\""</xsl:text>
</xsl:when>
<xsl:when test="$s = '#'">
<xsl:text>"#"</xsl:text>
</xsl:when>
<xsl:when test="$s = '|'">
<xsl:text>"|"</xsl:text>
</xsl:when>
<xsl:when test="$s = '#'">
<xsl:text>"#"</xsl:text>
</xsl:when>
<xsl:when test="$s = '\'">
<xsl:text>"\\"</xsl:text>
</xsl:when>
<xsl:when test="$s = '}'">
<xsl:text>"}"</xsl:text>
</xsl:when>
<xsl:when test="$s = '&'">
<xsl:text>"&"</xsl:text>
</xsl:when>
<xsl:when test="$s = '^'">
<xsl:text>"^"</xsl:text>
</xsl:when>
<xsl:when test="$s = '~'">
<xsl:text>"~"</xsl:text>
</xsl:when>
<xsl:when test="$s = '/'">
<xsl:text>"/"</xsl:text>
</xsl:when>
<xsl:when test="$s = '{'">
<xsl:text>"{"</xsl:text>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$s"/>
</xsl:otherwise>
</xsl:choose>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
This template accounts for the majority of runtime which my XSLT script needs. Replacing the above escape-text template with just
<xsl:template name="escape-text">
<xsl:param name="s" select="."/>
<xsl:value-of select="$s"/>
</xsl:template>
makes the runtime of my XSLT script go from 45 seconds to less than one seconds on one of my documents.
Hence my question: how can I speed up my escape-text template? I'm using xsltproc and I'd prefer a pure XSLT 1.0 solution. XSLT 2.0 solutions would be welcome too. However, external libraries might not be useful for this project - I'd still be interested in any solutions using them though.
Another (complementary) strategy would be to terminate the recursion early, before the string length is down to 1, if the condition translate($s, $vChars, '') = $s is true. This should give much faster processing of strings that contain no special characters at all, which is probably the majority of them. Of course the results will depend on how efficient xsltproc's implementation of translate() is.
A very small correction improved the speed in my tests about 17 times.
There are additional improvements, but I guess this will suffice for now ... :)
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:my="my:my">
<xsl:output omit-xml-declaration="yes" indent="yes"/>
<xsl:strip-space elements="*"/>
<xsl:variable name="vChars">"#|#\}&^~/{</xsl:variable>
<xsl:template match="node()|#*">
<xsl:copy>
<xsl:apply-templates select="node()|#*"/>
</xsl:copy>
</xsl:template>
<xsl:template match="text()" name="escape-text">
<xsl:param name="s" select="."/>
<xsl:param name="len" select="string-length($s)"/>
<xsl:choose>
<xsl:when test="$len >= 2">
<xsl:variable name="halflen" select="round($len div 2)"/>
<xsl:variable name="left">
<xsl:call-template name="escape-text">
<xsl:with-param name="s" select="substring($s, 1, $halflen)"/>
<xsl:with-param name="len" select="$halflen"/>
</xsl:call-template>
</xsl:variable>
<xsl:variable name="right">
<xsl:call-template name="escape-text">
<xsl:with-param name="s" select="substring($s, $halflen + 1)"/>
<xsl:with-param name="len" select="$halflen"/>
</xsl:call-template>
</xsl:variable>
<xsl:value-of select="concat($left, $right)"/>
</xsl:when>
<xsl:otherwise>
<xsl:choose>
<xsl:when test="not(contains($vChars, $s))">
<xsl:value-of select="$s"/>
</xsl:when>
<xsl:when test="$s = '"'">
<xsl:text>"\""</xsl:text>
</xsl:when>
<xsl:when test="$s = '#'">
<xsl:text>"#"</xsl:text>
</xsl:when>
<xsl:when test="$s = '|'">
<xsl:text>"|"</xsl:text>
</xsl:when>
<xsl:when test="$s = '#'">
<xsl:text>"#"</xsl:text>
</xsl:when>
<xsl:when test="$s = '\'">
<xsl:text>"\\"</xsl:text>
</xsl:when>
<xsl:when test="$s = '}'">
<xsl:text>"}"</xsl:text>
</xsl:when>
<xsl:when test="$s = '&'">
<xsl:text>"&"</xsl:text>
</xsl:when>
<xsl:when test="$s = '^'">
<xsl:text>"^"</xsl:text>
</xsl:when>
<xsl:when test="$s = '~'">
<xsl:text>"~"</xsl:text>
</xsl:when>
<xsl:when test="$s = '/'">
<xsl:text>"/"</xsl:text>
</xsl:when>
<xsl:when test="$s = '{'">
<xsl:text>"{"</xsl:text>
</xsl:when>
</xsl:choose>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
</xsl:stylesheet>
Here is a more improved version, based on #Dimitre's answer:
<xsl:template match="text()" name="escape-text">
<xsl:param name="s" select="."/>
<xsl:param name="len" select="string-length($s)"/>
<xsl:choose>
<xsl:when test="$len > 1">
<xsl:variable name="halflen" select="round($len div 2)"/>
<!-- no "left" and "right" variables necessary! -->
<xsl:call-template name="escape-text">
<xsl:with-param name="s" select="substring($s, 1, $halflen)"/>
</xsl:call-template>
<xsl:call-template name="escape-text">
<xsl:with-param name="s" select="substring($s, $halflen + 1)"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<xsl:choose>
<xsl:when test="not(contains($vChars, $s))">
<xsl:value-of select="$s"/>
</xsl:when>
<xsl:when test="contains('\"', $s)">
<xsl:value-of select="concat('"\', $s, '"')" />
</xsl:when>
<!-- all other cases can be collapsed, this saves some time -->
<xsl:otherwise>
<xsl:value-of select="concat('"', $s, '"')" />
</xsl:otherwise>
</xsl:choose>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
Should be another tiny bit faster, but I have not benchmarked it. In any case, it's shorter. ;-)
For what it's worth, here's my current version of the escape-text template which incorporates most of the (excellent!) suggestions which people have given in response to my question. For the record, my original version took about 45605ms on average on my sample DocBook document. After that, the runtime was decreased in multiple steps:
Removing the left and right variable together with the concat() call brought the runtime down to 13052ms; this optimization was taken from Tomalak's answer.
Moving the common case (which is: the given character doesn't need any special escaping) first in the inner <xsl:choose> element brought the runtime further down to 5812ms. This optimization was first suggested by Dimitre.
Aborting the recursion early by first testing whether the given string contains any of the special characters at all brought the runtime down to 612ms. This optimization was suggested by Michael.
Finally, I couldn't resist doing a micro optimization after reading a comment by Dimitre in Tomalak's answer: I replaced the <xsl:value-of select="concat('x', $s, 'y')"/> calls with <xsl:text>x</xsl:text><xsl:value-of select="$s"/><xsl:text>y</xsl:text>. This brought the runtime to about 606ms (so about 1% improvement).
In the end, the function took 606ms instead of 45605ms. Impressive!
<xsl:variable name="specialLoutChars">"#|#\}&^~/{</xsl:variable>
<xsl:template name="escape-text">
<xsl:param name="s" select="."/>
<xsl:param name="len" select="string-length($s)"/>
<xsl:choose>
<!-- Common case optimization:
no need to recurse if there are no special characters -->
<xsl:when test="translate($s, $specialLoutChars, '') = $s">
<xsl:value-of select="$s"/>
</xsl:when>
<!-- String length greater than 1, use DVC pattern -->
<xsl:when test="$len > 1">
<xsl:variable name="halflen" select="round($len div 2)"/>
<xsl:call-template name="escape-text">
<xsl:with-param name="s" select="substring($s, 1, $halflen)"/>
<xsl:with-param name="len" select="$halflen"/>
</xsl:call-template>
<xsl:call-template name="escape-text">
<xsl:with-param name="s" select="substring($s, $halflen + 1)"/>
<xsl:with-param name="len" select="$len - $halflen"/>
</xsl:call-template>
</xsl:when>
<!-- Special character -->
<xsl:otherwise>
<xsl:text>"</xsl:text>
<!-- Backslash and quot need backslash escape -->
<xsl:if test="$s = '"' or $s = '\'">
<xsl:text>\</xsl:text>
</xsl:if>
<xsl:value-of select="$s"/>
<xsl:text>"</xsl:text>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
How about using EXSLT? The String functions in EXSLT have a function called replace. I think it is something that is supported by quite a few XSLT implementations.
Update: I fixed this to actually work; now, it is not a speedup!
Building off #Wilfred's answer...
After fiddling with the EXSLT replace() function, I decided it was interesting enough to post another answer, even if it's not useful to the OP. It may well be useful to others.
It's interesting because of the algorithm: instead of the main algorithm worked on here (doing a binary recursive search, dividing in half at each recursion, pruned whenever a 2^nth substring has no special characters in it, and iterating over a choice of special characters when a length=1 string does contain a special character), Jeni Tennison's EXSLT algorithm puts the iteration over a set of search strings on the outside loop. Therefore on the inside of the loop, it is only searching for one string at a time, and can use substring-before()/substring-after() to divide the string, instead of blindly dividing in half.
[Deprecated: I guess that's enough to speed it up significantly. My tests show a speedup of 2.94x over #Dimitre's most recent one (avg. 230ms vs. 676ms).] I was testing using Saxon 6.5.5 in the Oxygen XML profiler. As input I used a 7MB XML document that was mostly a single text node, created from web pages about javascript, repeated. It sounds to me like that is representative of the task that the OP was trying to optimize. I'd be interested to see hear what results others get, with their test data and environments.
Dependencies
This uses an XSLT implementation of replace which relies on exsl:node-set(). It looks like xsltproc supports this extension function (possibly an early version of it). So this may work out-of-the-box for you, #Frerich; and for other processors, as it did with Saxon.
However if we want 100% pure XSLT 1.0, I think it would not be too hard to modify this replace template to work without exsl:node-set(), as long as the 2nd and 3rd params are passed in as nodesets, not RTFs.
Here is the code I used, which calls the replace template. Most of the length is taken up with the verbose way I created search/replace nodesets... that could probably be shortened. (But you can't make the search or replace nodes attributes, as the replace template is currently written. You'll get an error about trying to put attributes under the document element.)
<xsl:stylesheet version="1.0" xmlns:str="http://exslt.org/strings"
xmlns:foo="http://www.foo.net/something" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:import href="lars.replace.template.xsl"/>
<foo:replacements>
<replacement>
<search>"</search>
<replace>"\""</replace>
</replacement>
<replacement>
<search>\</search>
<replace>"\\"</replace>
</replacement>
<replacement>
<search>#</search>
<replace>"["</replace>
</replacement>
<replacement>
<search>|</search>
<replace>"["</replace>
</replacement>
<replacement>
<search>#</search>
<replace>"["</replace>
</replacement>
<replacement>
<search>}</search>
<replace>"}"</replace>
</replacement>
<replacement>
<search>&</search>
<replace>"&"</replace>
</replacement>
<replacement>
<search>^</search>
<replace>"^"</replace>
</replacement>
<replacement>
<search>~</search>
<replace>"~"</replace>
</replacement>
<replacement>
<search>/</search>
<replace>"/"</replace>
</replacement>
<replacement>
<search>{</search>
<replace>"{"</replace>
</replacement>
</foo:replacements>
<xsl:template name="escape-text" match="text()" priority="2">
<xsl:call-template name="str:replace">
<xsl:with-param name="string" select="."/>
<xsl:with-param name="search"
select="document('')/*/foo:replacements/replacement/search/text()"/>
<xsl:with-param name="replace"
select="document('')/*/foo:replacements/replacement/replace/text()"/>
</xsl:call-template>
</xsl:template>
<xsl:template match="node()|#*">
<xsl:copy>
<xsl:apply-templates select="node()|#*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
The imported stylesheet was originally this one.
However, as #Frerich pointed out, that never gave the correct output!
That ought to teach me not to post performance figures without checking for correctness!
I can see in a debugger where it's going wrong, but I don't know whether the EXSLT template never worked, or if it just doesn't work in Saxon 6.5.5... either option would be surprising.
In any case, EXSLT's str:replace() is specified to do more than we need, so I modified it so as to
require that the input parameters are already nodesets
as a consequence, not require exsl:node-set()
not sort the search strings by length (they're all one character, in this application)
not insert a replacement string between every pair of characters when the corresponding search string is empty
Here is the modified replace template:
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:str="http://exslt.org/strings">
<!-- By Lars Huttar
based on implementation of EXSL str:replace() by Jenni Tennison.
http://www.exslt.org/str/functions/replace/str.replace.template.xsl
Modified by Lars not to need exsl:node-set(), not to bother sorting
search strings by length (in our application, all the search strings are of
length 1), and not to put replacements between every other character
when a search string is length zero.
Search and replace parameters must both be nodesets.
-->
<xsl:template name="str:replace">
<xsl:param name="string" select="''" />
<xsl:param name="search" select="/.." />
<xsl:param name="replace" select="/.." />
<xsl:choose>
<xsl:when test="not($string)" />
<xsl:when test="not($search)">
<xsl:value-of select="$string" />
</xsl:when>
<xsl:otherwise>
<xsl:variable name="search1" select="$search[1]" />
<xsl:variable name="replace1" select="$replace[1]" />
<xsl:choose>
<xsl:when test="contains($string, $search1)">
<xsl:call-template name="str:replace">
<xsl:with-param name="string"
select="substring-before($string, $search1)" />
<xsl:with-param name="search"
select="$search[position() > 1]" />
<xsl:with-param name="replace"
select="$replace[position() > 1]" />
</xsl:call-template>
<xsl:value-of select="$replace1" />
<xsl:call-template name="str:replace">
<xsl:with-param name="string"
select="substring-after($string, $search)" />
<xsl:with-param name="search" select="$search" />
<xsl:with-param name="replace" select="$replace" />
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<xsl:call-template name="str:replace">
<xsl:with-param name="string" select="$string" />
<xsl:with-param name="search"
select="$search[position() > 1]" />
<xsl:with-param name="replace"
select="$replace[position() > 1]" />
</xsl:call-template>
</xsl:otherwise>
</xsl:choose>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
</xsl:stylesheet>
One of the side benefits of this simpler template is that you could now use attributes for the nodes of your search and replace parameters. This would make the <foo:replacements> data more compact and easier to read IMO.
Performance: With this revised template, the job gets done in about 2.5s, vs. my 0.68s for my recent tests of the leading competitor, #Dimitre's XSLT 1.0 stylesheet. So it's not a speedup. But again, others have had very different test results than I have, so I'd like to hear what others get with this stylesheet.
After #Frerich-Raabe published a community wiki answer which combines the suggestions so far and achieves (on his data) a speedup of 76 times -- big congratulations to everybody!!!
I couldn't resist not to go further:
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:variable name="specialLoutChars">"#|#\}&^~/{</xsl:variable>
<xsl:key name="kTextBySpecChars" match="text()"
use="string-length(translate(., '"#|#\}&^~/', '') = string-length(.))"/>
<xsl:template match="node()|#*">
<xsl:copy>
<xsl:apply-templates select="node()|#*"/>
</xsl:copy>
</xsl:template>
<xsl:template match="text()[key('kTextBySpecChars', 'true')]" name="escape-text">
<xsl:param name="s" select="."/>
<xsl:param name="len" select="string-length($s)"/>
<xsl:choose>
<xsl:when test="$len >= 2">
<xsl:variable name="halflen" select="round($len div 2)"/>
<xsl:call-template name="escape-text">
<xsl:with-param name="s" select="substring($s, 1, $halflen)"/>
<xsl:with-param name="len" select="$halflen"/>
</xsl:call-template>
<xsl:call-template name="escape-text">
<xsl:with-param name="s" select="substring($s, $halflen + 1)"/>
<xsl:with-param name="len" select="$len - $halflen"/>
</xsl:call-template>
</xsl:when>
<xsl:when test="$len = 1">
<xsl:choose>
<!-- Common case: the character at hand needs no escaping at all -->
<xsl:when test="not(contains($specialLoutChars, $s))">
<xsl:value-of select="$s"/>
</xsl:when>
<xsl:when test="$s = '"' or $s = '\'">
<xsl:text>"\</xsl:text>
<xsl:value-of select="$s"/>
<xsl:text>"</xsl:text>
</xsl:when>
<xsl:otherwise>
<xsl:text>"</xsl:text>
<xsl:value-of select="$s"/>
<xsl:text>"</xsl:text>
</xsl:otherwise>
</xsl:choose>
</xsl:when>
</xsl:choose>
</xsl:template>
</xsl:stylesheet>
This transformation achieves (on my data) a further speedup of 1.5 times. So the total speedup should be more than 100 times.
OK, I'll chip in. Though not as interesting as optimizing the XSLT 1.0 version, you did say that XSLT 2.0 solutions are welcome, so here's mine.
<xsl:stylesheet version="2.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:template name="escape-text" match="text()" priority="2">
<xsl:variable name="regex1">[#|#}&^~/{]</xsl:variable>
<xsl:variable name="replace1">"$0"</xsl:variable>
<xsl:variable name="regex2">["\\]</xsl:variable>
<xsl:variable name="replace2">"\\$0"</xsl:variable>
<xsl:value-of select='replace(replace(., $regex2, $replace2),
$regex1, $replace1)'/>
</xsl:template>
<xsl:template match="node()|#*">
<xsl:copy>
<xsl:apply-templates select="node()|#*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
This just uses a regexp replace() to replace \ or " with "\" or "\"" respectively; composed with another regexp replace() to surround any of the other escapable characters with quotes.
In my tests, this performs worse than Dimitre's most recent XSLT 1.0 offering, by a factor of more than 2. (But I made up my own test data, and other conditions may be idiosyncratic, so I'd like to know what results others get.)
Why the slower performance? I can only guess it's because searching for regular expressions is slower than searching for fixed strings.
Update: using analyze-string
As per #Alejandro's suggestion, here it is using analyze-string:
<xsl:template name="escape-text" match="text()" priority="2">
<xsl:analyze-string select="." regex='([#|#}}&^~/{{])|(["\\])'>
<xsl:matching-substring>
<xsl:choose>
<xsl:when test="regex-group(1)">"<xsl:value-of select="."/>"</xsl:when>
<xsl:otherwise>"\<xsl:value-of select="."/>"</xsl:otherwise>
</xsl:choose>
</xsl:matching-substring>
<xsl:non-matching-substring><xsl:value-of select="."/></xsl:non-matching-substring>
</xsl:analyze-string>
</xsl:template>
While this seems like a good idea, unfortunately it does not give us a performance win: In my setup, it consistently takes about 14 seconds to complete, versus 1 - 1.4 sec for the replace() template above. Call that a 10-14x slowdown. :-( This suggests to me that breaking and concatenating lots of big strings at the XSLT level is a lot more expensive than traversing a big string twice in a built-in function.

How to format a string to Pascal case in XSLT?

I'm trying to format strings in XSLT that needs to be in pascal case to be used appropriately for the application I'm working with.
For example:
this_text would become ThisText
this_long_text would become ThisLongText
Is it possible to also set this up where I can send an input to the format so I do not have to recreate the format multiple times?
This transformation:
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output omit-xml-declaration="yes" indent="yes"/>
<xsl:variable name="vLower" select=
"'abcdefghijklmnopqrstuvwxyz'"/>
<xsl:variable name="vUpper" select=
"'ABCDEFGHIJKLMNOPQRSTUVWXYZ'"/>
<xsl:template match="node()|#*">
<xsl:copy>
<xsl:apply-templates select="node()|#*"/>
</xsl:copy>
</xsl:template>
<xsl:template match="text()">
<xsl:call-template name="Pascalize">
<xsl:with-param name="pText" select="concat(., '_')"/>
</xsl:call-template>
</xsl:template>
<xsl:template name="Pascalize">
<xsl:param name="pText"/>
<xsl:if test="$pText">
<xsl:value-of select=
"translate(substring($pText,1,1), $vLower, $vUpper)"/>
<xsl:value-of select="substring-before(substring($pText,2), '_')"/>
<xsl:call-template name="Pascalize">
<xsl:with-param name="pText"
select="substring-after(substring($pText,2), '_')"/>
</xsl:call-template>
</xsl:if>
</xsl:template>
</xsl:stylesheet>
when applied on this XML document:
<t>
<a>this_text</a>
<b>this_long_text</b>
</t>
produces the desired result:
<t>
<a>ThisText</a>
<b>ThisLongText</b>
</t>
BTW, this is camelCase and this is PascalCase
Here, two years after the fact, is an XSLT 2.0 solution:
<xsl:function name="fn:pascal-case">
<xsl:param name="string"/>
<xsl:value-of select="string-join(for $s in tokenize($string,'\W+') return concat(upper-case(substring($s,1,1)),substring($s,2)),'')"/>
</xsl:function>
It will pascalize either 'this_long_text' or 'this-long-text' to 'ThisLongText' because it breaks on any non-word characters.
In the regex flavors I am most familiar with (perl, pcre, etc.), an underscore is considered part of the '\w' character class (therefore not part of \W), but for XSLT 2.0 the XSD datatypes are used (http://www.w3.org/TR/xmlschema-2/) and '\w' is defined as:
[#x0000-#x10FFFF]-[\p{P}\p{Z}\p{C}] (all characters except the set of "punctuation", "separator" and "other" characters)
so '\W' includes an underscore.
This version worked for me. I added a choose that outputs "the rest" of the string when no more underbars are present.
<xsl:variable name="vLower" select="'abcdefghijklmnopqrstuvwxyz'"/>
<xsl:variable name="vUpper" select="'ABCDEFGHIJKLMNOPQRSTUVWXYZ'"/>
<xsl:template name="Pascalize">
<xsl:param name="pText" />
<xsl:if test="$pText">
<xsl:value-of select="translate(substring($pText,1,1), $vLower, $vUpper)" />
<xsl:choose>
<xsl:when test="contains($pText, '_')">
<xsl:value-of select="substring-before(substring($pText,2), '_')" />
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="substring($pText,2)" />
</xsl:otherwise>
</xsl:choose>
<xsl:call-template name="Pascalize">
<xsl:with-param name="pText" select="substring-after(substring($pText,2), '_')" />
</xsl:call-template>
</xsl:if>
</xsl:template>
Also, in case anyone comes here looking for the reverse process (which I happened to also require today and could find not a single example of anywhere)...
<xsl:variable name="vLower" select="'abcdefghijklmnopqrstuvwxyz'"/>
<xsl:variable name="vUpper" select="'ABCDEFGHIJKLMNOPQRSTUVWXYZ'"/>
<xsl:template name="TitleCase">
<xsl:param name="pText" />
<xsl:call-template name="TitleCase_recurse">
<xsl:with-param name="pText" select="concat(translate(substring($pText,1,1), $vLower, $vUpper), substring($pText,2))" />
</xsl:call-template>
</xsl:template>
<xsl:template name="TitleCase_recurse">
<xsl:param name="pText" />
<xsl:if test="string-length($pText) > 1">
<xsl:if test="not(substring($pText,1,1) = ' ' and substring($pText,1,1) = ' ')">
<xsl:value-of select="substring($pText,1,1)" />
</xsl:if>
<xsl:if test="translate(substring($pText,1,1), $vLower, $vUpper) != substring($pText,1,1)">
<xsl:if test="translate(substring($pText,2,1), $vLower, $vUpper) = substring($pText,2,1)">
<xsl:text> </xsl:text>
</xsl:if>
</xsl:if>
<xsl:call-template name="TitleCase_recurse">
<xsl:with-param name="pText" select="substring($pText,2)" />
</xsl:call-template>
</xsl:if>
<xsl:if test="string-length($pText) = 1">
<xsl:value-of select="$pText" />
</xsl:if>
</xsl:template>
I love it when my subconscious brain pops up an answer a few hours after I've completely given up consciously. ;-)
I was trying to achieve the "pascalizing" with the following XLST function call:
<xsl:value-of select="fn:replace(#name,'_(\w{1})','\U$1')"/>
Unfortunately the processor throws the error message "Invalid replacement string in replace():
\ character must be followed by \ or $"
the problem is the \U modifier which is supposed to do the uppercase conversion of the matched pattern. If I change it to
<xsl:value-of select="fn:replace(#name,'_(\w{1})','\\U$1')"/>
the output string contains the sequence '\U' because it is now esacped - but I don't want to escape it, I want it do be effective ;-) . I did test
<xsl:value-of select="fn:replace(#name,'_(\w{1})','$1')"/>
(without converting the match to uppercase) and that works fine. But of course it does no uppercasing, just removes underscores and replaces the letter after the underscore by itself instead of capitalizing it. Am I doing something wrong here or is the \U modifier simply not supported in the regex implementation of my XSLT processor?
Thanks to Dimitre, I was able to get most of the way there. When running my strings through the Pascalize template, the bit after the last '_' was cut off. There's probably a cleaner way of doing it, but here's the code I used:
<xsl:template name="Pascalize">
<xsl:param name="pText"/>
<xsl:if test="$pText">
<xsl:value-of select="translate(substring($pText,1,1), $vLower, $vUpper)"/>
<xsl:value-of select="substring-before(substring($pText,2), '_')"/>
<xsl:call-template name="Pascalize">
<xsl:with-param name="pText" select="substring-after(substring($pText,2), '_')"/>
</xsl:call-template>
<xsl:call-template name="GrabLastPart">
<xsl:with-param name="pText" select="$pText"/>
</xsl:call-template>
</xsl:if>
</xsl:template>
<xsl:template name="GrabLastPart">
<xsl:param name="pText"/>
<xsl:choose>
<xsl:when test="contains($pText, '_')">
<xsl:call-template name="GrabLastPart">
<xsl:with-param name="pText" expr="substring-after($pText, '_')"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="substring($pText, 2)"/>
</xsl:otherwise>
</xsl:choose>
</xsl:template>

Resources