xsl-list
[Top] [All Lists]

Finding "unknown" character references

2005-11-22 05:53:21
Hi,

I need to check for "unknown" character references within my XML files.
All valid character references are stored in another XML file
("invalid_chars.xml") :

<?xml version="1.0" encoding="ISO-8859-1"?>
<chars>
   <char>&#8222;</char>
   <char>&#8218;</char>
   <char>&#353;</char>
   ...
</chars>

Everytime a special character, which is unknown to this reference file,
is encountered, 
it should be outputted.


What I've come up with is:

<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="2.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform";>
    <xsl:output method="xml" encoding="ISO-8859-1" indent="yes"/>
        
    <xsl:template name="init">
        <xsl:variable name="source" select="'../dirlist.txt'"/>
        <xsl:variable name="encoding" select="'iso-8859-1'"/>
        <xsl:variable name="src">
            <errors>
               <xsl:for-each select="tokenize(unparsed-text($source,
$encoding), '\r?\n')">
                <file>
                    <xsl:value-of select="."/>
                </file>
                </xsl:for-each>
             </errors>
        </xsl:variable>
        <xsl:result-document href="invalid_chars.xml">
            <errors>
                 <xsl:apply-templates select="$src//file[text()]"/>
             </errors>
        </xsl:result-document>
      </xsl:template>

      <xsl:template match="file">
        <xsl:analyze-string select="."
regex="^([0-9]{{2}}.[0-9]{{2}}.[0-9]{{4}})\p{{Zs}}+([0-9]{{2}}:[0-9]{{2}
})\p{{Zs}}+([0-9.]+)(.*)$" flags="ix">
              <xsl:matching-substring>
                    <xsl:variable name="docpath"
select="normalize-space(regex-group(4))"/>
                     <xsl:variable name="errors" as="element()*">
                      
                        <!-- Here my problems begin -->
                
                        <xsl:if
test="document($docpath)//text()[contains(., '&amp;')]">
                              <xsl:analyze-string select="."
regex="&amp;[#a-zA-Z0-9]+" flags="i">
                                <xsl:matching-substring>
                                     <xsl:if
test="document('valid_cahrs.xml')//char=concat(regex-group(0), ';')">
                                        <file>
                                            <char>
                                                 <xsl:value-of
select="."/>
                                                 <xsl:text>;</xsl:text>
                                            </char>
                                        </file>
                                      </xsl:if>
                                </xsl:matching-substring>
                                </xsl:analyze-string>
                          </xsl:if>
                      </xsl:variable>

                      <xsl:for-each-group select="$errors"
group-by="@name">
                        <file name="{current-grouping-key()}">
                             <xsl:for-each select="current-group()">
                                <xsl:copy-of select="*"/>
                              </xsl:for-each>
                        </file>
                       </xsl:for-each-group>
                </xsl:matching-substring>
                <xsl:non-matching-substring/>
           </xsl:analyze-string>
       </xsl:template>
</xsl:stylesheet>


Example XML:

<?xml version="1.0" encoding="ISO-8859-1"?>
<root>
     <absatz> &#8222;dyplom licencjata piel&#281;gniarstwa&#8220;
</absatz>
</root>


How can I check, if a text node contains a character unknown to my
valid_chars.xml?
Do you have any ideas, how I can make my XSLT work?
Thank you very much.

wbr,
Roman

--~------------------------------------------------------------------
XSL-List info and archive:  http://www.mulberrytech.com/xsl/xsl-list
To unsubscribe, go to: http://lists.mulberrytech.com/xsl-list/
or e-mail: <mailto:xsl-list-unsubscribe(_at_)lists(_dot_)mulberrytech(_dot_)com>
--~--



<Prev in Thread] Current Thread [Next in Thread>