"Costello, Roger L." <costello(_at_)mitre(_dot_)org> wrote:
> Has anyone implemented a fuzzy matcher (approximate string
> matcher [1]) in XPath?
I implemented one, based on pairwise string alignment, as an XSLT 1.0 template.
The compare-strings template takes as parameters two strings and returns a
score between 0 and 1 representing the closeness of the match:
<?xml version="1.0" ?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:str="http://exslt.org/strings"
extension-element-prefixes="str">
<!-- Alphabets handed to translate() to upper-case ASCII letters before comparison. -->
<xsl:variable name="lower-case-letters" select="'abcdefghijklmnopqrstuvwxyz'"/>
<xsl:variable name="upper-case-letters" select="'ABCDEFGHIJKLMNOPQRSTUVWXYZ'"/>
<!--
  compare-strings: entry point of the matcher.

  Parameters:
    string1, string2 : the two strings to compare.

  Each input is upper-cased (ASCII a..z only, via translate()) and
  whitespace-normalized, turned into a letter-pair list by
  get-word-letter-pairs, and the two lists are handed to compare-pairs,
  whose output becomes the output of this template.
-->
<xsl:template name="compare-strings">
  <xsl:param name="string1"/>
  <xsl:param name="string2"/>
  <xsl:call-template name="compare-pairs">
    <!-- Letter pairs of the first string. -->
    <xsl:with-param name="pairs1">
      <xsl:call-template name="get-word-letter-pairs">
        <xsl:with-param name="string"
                        select="normalize-space(translate($string1, $lower-case-letters, $upper-case-letters))"/>
      </xsl:call-template>
    </xsl:with-param>
    <!-- Letter pairs of the second string. -->
    <xsl:with-param name="pairs2">
      <xsl:call-template name="get-word-letter-pairs">
        <xsl:with-param name="string"
                        select="normalize-space(translate($string2, $lower-case-letters, $upper-case-letters))"/>
      </xsl:call-template>
    </xsl:with-param>
  </xsl:call-template>
</xsl:template>
<!--
  compare-pairs: scores two letter-pair lists.

  Parameters:
    pairs1, pairs2 : space-terminated lists of two-character pairs,
                     e.g. 'FR RA AN ' (three characters per entry).

  Output: 2 * (number of shared pairs) div (total number of pairs in both
  lists). When neither list contains a pair (for example both inputs were
  empty or consisted only of one-letter words) the divisor would be zero
  and the result NaN; in that case 0 is emitted instead so the output is
  always a number between 0 and 1.
-->
<xsl:template name="compare-pairs">
  <xsl:param name="pairs1"/>
  <xsl:param name="pairs2"/>
  <!-- Every entry occupies exactly three characters: two letters plus the separator. -->
  <xsl:variable name="num-pairs1" select="string-length($pairs1) div 3"/>
  <xsl:variable name="num-pairs2" select="string-length($pairs2) div 3"/>
  <!-- Despite its name this is the combined size of both lists, not a set union. -->
  <xsl:variable name="union" select="$num-pairs1 + $num-pairs2"/>
  <xsl:choose>
    <!-- No pairs on either side: avoid 0 div 0 (NaN). -->
    <xsl:when test="$union = 0">0</xsl:when>
    <xsl:otherwise>
      <xsl:variable name="intersection">
        <xsl:call-template name="intersect-remaining-pairs">
          <xsl:with-param name="pairs1" select="$pairs1"/>
          <xsl:with-param name="pairs2" select="$pairs2"/>
        </xsl:call-template>
      </xsl:variable>
      <xsl:value-of select="2.0 * $intersection div $union"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!--
  intersect-remaining-pairs: counts how many entries of pairs1 also occur
  in pairs2, consuming one matching entry of pairs2 per hit so duplicates
  are only counted as often as they appear on both sides.

  Parameters:
    pairs1, pairs2 : space-terminated letter-pair lists.
    intersection   : running count, 0 on the initial call.

  Output: the final count once pairs1 has no further entry.
-->
<xsl:template name="intersect-remaining-pairs">
  <xsl:param name="pairs1"/>
  <xsl:param name="pairs2"/>
  <xsl:param name="intersection">0</xsl:param>
  <!-- First entry of the first list; empty when the list is exhausted. -->
  <xsl:variable name="head" select="substring-before($pairs1, ' ')"/>
  <xsl:choose>
    <xsl:when test="string-length($head) = 0">
      <xsl:value-of select="$intersection"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:variable name="matched" select="contains($pairs2, $head)"/>
      <xsl:call-template name="intersect-remaining-pairs">
        <xsl:with-param name="pairs1" select="substring-after($pairs1, ' ')"/>
        <xsl:with-param name="pairs2">
          <xsl:choose>
            <!-- On a hit, drop the first occurrence of the entry (and its separator). -->
            <xsl:when test="$matched">
              <xsl:value-of select="concat(substring-before($pairs2, $head), substring-after($pairs2, concat($head, ' ')))"/>
            </xsl:when>
            <xsl:otherwise>
              <xsl:value-of select="$pairs2"/>
            </xsl:otherwise>
          </xsl:choose>
        </xsl:with-param>
        <!-- number(true()) is 1 and number(false()) is 0. -->
        <xsl:with-param name="intersection" select="$intersection + number($matched)"/>
      </xsl:call-template>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!--
  get-word-letter-pairs: builds the letter-pair list for a string of
  space-separated words, one word per recursion step.

  Parameters:
    string : the words still to be processed, separated by single spaces.
    pairs  : the pairs accumulated so far (empty on the initial call).

  Output: the accumulated list once no words remain. Pairs never span a
  word boundary because each word is passed to get-letter-pairs on its own.
-->
<xsl:template name="get-word-letter-pairs">
  <xsl:param name="string"/>
  <xsl:param name="pairs"></xsl:param>
  <xsl:choose>
    <xsl:when test="$string = ''">
      <xsl:value-of select="$pairs"/>
    </xsl:when>
    <xsl:otherwise>
      <!-- Appending a space guarantees a separator exists, so this yields
           the first word, or the whole string when it is the last word. -->
      <xsl:variable name="first-word" select="substring-before(concat($string, ' '), ' ')"/>
      <xsl:variable name="word-pairs">
        <xsl:call-template name="get-letter-pairs">
          <xsl:with-param name="word" select="$first-word"/>
        </xsl:call-template>
      </xsl:variable>
      <!-- substring-after() returns '' when no space is left, ending the recursion. -->
      <xsl:call-template name="get-word-letter-pairs">
        <xsl:with-param name="string" select="substring-after($string, ' ')"/>
        <xsl:with-param name="pairs" select="concat($pairs, $word-pairs)"/>
      </xsl:call-template>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!--
  get-letter-pairs: lists the adjacent two-character pairs of one word.

  Parameters:
    word  : the (remaining part of the) word.
    pairs : the pairs accumulated so far (empty on the initial call).

  Output: every pair followed by a single space, e.g. 'FRANCE' gives
  'FR RA AN NC CE '. A word shorter than two characters contributes nothing.
-->
<xsl:template name="get-letter-pairs">
  <xsl:param name="word"/>
  <xsl:param name="pairs"></xsl:param>
  <xsl:choose>
    <!-- '<' must be written as &lt; inside an attribute value to keep the stylesheet well-formed. -->
    <xsl:when test="string-length($word) &lt; 2">
      <xsl:value-of select="$pairs"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:call-template name="get-letter-pairs">
        <!-- Drop the first character; the next pair starts one position later. -->
        <xsl:with-param name="word" select="substring($word, 2)"/>
        <xsl:with-param name="pairs" select="concat($pairs, substring($word, 1, 2), ' ')"/>
      </xsl:call-template>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
</xsl:stylesheet>
--
Roger L. Cauvin
"Smart product decisions"
Twitter: @rcauvin
http://blog.cauvin.org
--~------------------------------------------------------------------
XSL-List info and archive: http://www.mulberrytech.com/xsl/xsl-list
To unsubscribe, go to: http://lists.mulberrytech.com/xsl-list/
or e-mail: <mailto:xsl-list-unsubscribe(_at_)lists(_dot_)mulberrytech(_dot_)com>
--~--