xsl-list
[Top] [All Lists]

[xsl] Re: Anyone implemented a fuzzy matcher in XPath?

2013-01-30 05:15:41
"Costello, Roger L." <costello(_at_)mitre(_dot_)org> wrote:

Has anyone implemented a fuzzy matcher (approximate string
matcher [1]) in XPath?

I implemented one, based on pairwise string alignment, as an XSLT 1.0 template. 
 The compare-strings template takes as parameters two strings and returns a 
score between 0 and 1 representing the closeness of the match:

<?xml version="1.0" ?>

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                              xmlns:str="http://exslt.org/strings"
                              extension-element-prefixes="str">

  <!-- ASCII alphabets used as the from/to arguments of translate() for
       case folding. Both strings list the letters in the same order, so
       position N of one maps to position N of the other. -->
  <xsl:variable name="lower-case-letters" select="'abcdefghijklmnopqrstuvwxyz'"/>
  <xsl:variable name="upper-case-letters" select="'ABCDEFGHIJKLMNOPQRSTUVWXYZ'"/>

  <!--
    compare-strings: entry point of the matcher.

    Parameters:
      string1, string2  the two strings to compare.

    Each input is upper-cased (ASCII letters only, via translate) and
    whitespace-normalised, then turned into a list of adjacent-letter
    pairs by get-word-letter-pairs. The two lists are handed to
    compare-pairs, whose output is the output of this template.
  -->
  <xsl:template name="compare-strings">
    <xsl:param name="string1"/>
    <xsl:param name="string2"/>

    <xsl:variable name="normalized1"
                  select="normalize-space(translate($string1, $lower-case-letters, $upper-case-letters))"/>
    <xsl:variable name="normalized2"
                  select="normalize-space(translate($string2, $lower-case-letters, $upper-case-letters))"/>

    <xsl:call-template name="compare-pairs">
      <xsl:with-param name="pairs1">
        <xsl:call-template name="get-word-letter-pairs">
          <xsl:with-param name="string" select="$normalized1"/>
        </xsl:call-template>
      </xsl:with-param>
      <xsl:with-param name="pairs2">
        <xsl:call-template name="get-word-letter-pairs">
          <xsl:with-param name="string" select="$normalized2"/>
        </xsl:call-template>
      </xsl:with-param>
    </xsl:call-template>

  </xsl:template>

  <!--
    compare-pairs: scores two letter-pair lists.

    Parameters:
      pairs1, pairs2  pair lists in which every entry is two characters
                      followed by one space (three characters per entry).

    Output: 2 * (number of shared pairs) / (size of list 1 + size of list 2).
    The variable named "union" holds the SUM of the two list sizes, not a
    set union; the name is kept from the original.

    When neither list contains a pair (for example both inputs are empty
    or consist only of single-character words) the denominator is zero.
    Dividing would emit NaN, so 0 is emitted instead: with no pairs there
    is no evidence of similarity.
  -->
  <xsl:template name="compare-pairs">
    <xsl:param name="pairs1"/>
    <xsl:param name="pairs2"/>

    <!-- Each entry occupies exactly three characters, so length div 3 is the entry count. -->
    <xsl:variable name="num-pairs1" select="string-length($pairs1) div 3"/>
    <xsl:variable name="num-pairs2" select="string-length($pairs2) div 3"/>
    <xsl:variable name="union" select="$num-pairs1 + $num-pairs2"/>

    <xsl:choose>
      <xsl:when test="$union = 0">
        <!-- Guard against 0 div 0, which XPath evaluates to NaN. -->
        <xsl:value-of select="0"/>
      </xsl:when>
      <xsl:otherwise>
        <xsl:variable name="intersection">
          <xsl:call-template name="intersect-remaining-pairs">
            <xsl:with-param name="pairs1" select="$pairs1"/>
            <xsl:with-param name="pairs2" select="$pairs2"/>
          </xsl:call-template>
        </xsl:variable>

        <xsl:value-of select="2.0 * $intersection div $union"/>
      </xsl:otherwise>
    </xsl:choose>

  </xsl:template>

  <!--
    intersect-remaining-pairs: counts how many entries of pairs1 can be
    matched against entries of pairs2, consuming each matched entry of
    pairs2 so that it is not counted twice.

    Parameters:
      pairs1        space-terminated pair list still to be examined.
      pairs2        space-terminated pair list still available for matching.
      intersection  running count of matches (defaults to 0).

    Output: the final match count, emitted once pairs1 yields no further
    entry (substring-before returns the empty string).
  -->
  <xsl:template name="intersect-remaining-pairs">
    <xsl:param name="pairs1"/>
    <xsl:param name="pairs2"/>
    <xsl:param name="intersection">0</xsl:param>

    <xsl:variable name="head" select="substring-before($pairs1, ' ')"/>

    <xsl:if test="$head = ''">
      <xsl:value-of select="$intersection"/>
    </xsl:if>

    <xsl:if test="not($head = '')">
      <xsl:variable name="found" select="contains($pairs2, $head)"/>

      <!-- When the head entry is present, drop its first occurrence
           (and the space after it) from pairs2; otherwise keep pairs2. -->
      <xsl:variable name="remaining">
        <xsl:choose>
          <xsl:when test="$found">
            <xsl:value-of select="concat(substring-before($pairs2, $head), substring-after($pairs2, concat($head, ' ')))"/>
          </xsl:when>
          <xsl:otherwise>
            <xsl:value-of select="$pairs2"/>
          </xsl:otherwise>
        </xsl:choose>
      </xsl:variable>

      <xsl:call-template name="intersect-remaining-pairs">
        <xsl:with-param name="pairs1" select="substring-after($pairs1, ' ')"/>
        <xsl:with-param name="pairs2" select="string($remaining)"/>
        <!-- number(true()) is 1 and number(false()) is 0. -->
        <xsl:with-param name="intersection" select="$intersection + number($found)"/>
      </xsl:call-template>
    </xsl:if>

  </xsl:template>

  <!--
    get-word-letter-pairs: builds the letter-pair list for a whole string
    by processing it one space-separated word at a time.

    Parameters:
      string  the text still to be processed; words are split on a
              single space.
      pairs   accumulator holding the pairs collected so far
              (defaults to empty).

    Output: the accumulated pair list once the string is exhausted.
  -->
  <xsl:template name="get-word-letter-pairs">
    <xsl:param name="string"/>
    <xsl:param name="pairs"></xsl:param>

    <xsl:choose>
      <xsl:when test="not($string = '')">
        <!-- Appending a space guarantees substring-before finds one, so
             this yields the first word, or the whole string when it
             holds a single word. -->
        <xsl:variable name="first-word" select="substring-before(concat($string, ' '), ' ')"/>

        <xsl:call-template name="get-word-letter-pairs">
          <xsl:with-param name="string" select="substring-after($string, ' ')"/>
          <xsl:with-param name="pairs">
            <xsl:value-of select="$pairs"/>
            <xsl:call-template name="get-letter-pairs">
              <xsl:with-param name="word" select="$first-word"/>
            </xsl:call-template>
          </xsl:with-param>
        </xsl:call-template>
      </xsl:when>
      <xsl:otherwise>
        <xsl:value-of select="$pairs"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>

  <!--
    get-letter-pairs: lists every pair of adjacent characters in a word.

    Parameters:
      word   the word still to be processed.
      pairs  accumulator holding the pairs collected so far
             (defaults to empty).

    Output: each pair followed by exactly ONE space, e.g. "ABC" gives
    "AB BC ". A word shorter than two characters contributes no pair.

    The separator literal must stay on one line. If it is wrapped, the
    line break inside the attribute value is normalised to an extra
    space and every entry becomes four characters long instead of three.
  -->
  <xsl:template name="get-letter-pairs">
    <xsl:param name="word"/>
    <xsl:param name="pairs"></xsl:param>

    <xsl:choose>
      <xsl:when test="string-length($word) &lt; 2">
        <xsl:value-of select="$pairs"/>
      </xsl:when>
      <xsl:otherwise>
        <xsl:call-template name="get-letter-pairs">
          <!-- Drop the first character and continue with the rest. -->
          <xsl:with-param name="word" select="substring($word, 2)"/>
          <xsl:with-param name="pairs" select="concat($pairs, substring($word, 1, 2), ' ')"/>
        </xsl:call-template>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>

</xsl:stylesheet>

--
Roger L. Cauvin
"Smart product decisions"
Twitter: @rcauvin
http://blog.cauvin.org



--~------------------------------------------------------------------
XSL-List info and archive:  http://www.mulberrytech.com/xsl/xsl-list
To unsubscribe, go to: http://lists.mulberrytech.com/xsl-list/
or e-mail: <mailto:xsl-list-unsubscribe(_at_)lists(_dot_)mulberrytech(_dot_)com>
--~--

<Prev in Thread] Current Thread [Next in Thread>
  • [xsl] Re: Anyone implemented a fuzzy matcher in XPath?, Roger L. Cauvin <=