Hello All,
I slapped together a quick XSLt script to convert an XML document
with a simple flat structure into a postgres-optimized table-load script.
I had a schema document (XSD), so I used that to defined the transformation,
but I also used the assumption that the schema was a simple 2-level document.
The XML is a reasonable size .. 19MB.
My first attempt is a stylesheet which dynamically used the information
from the XSD to pull the fields from the XML. This was taking > 30 minutes
to run.
It seemed to work, though I only ran it on a few similarly formatted test
documents. (The smallest of which, I'm including below)
(I used sablotron... XSLT1.0.. xsltproc/libxslt was taking much longer)
In the second attempt, I converted that stylesheet to one which generated
a specific stylesheet for that XML, which did the transformation in 30 seconds.
(The results diff'ed to be identical, minus one newline).
In general, I lean towards 1-stage processes over 2-stage processes when
I don't see a real usefulness to the intermediate stage (ie. modularizing
functionality).
If anybody spots a way to optimize this first stylesheet, I'd appreciate it.
If the schema document I'm including below is too big or verbose, I could
find a way to cut the example down a bit... in general though, it's just
a flat space, like the XML example below it.
Thanks,
Brian Chrisman
(hopefully my long lines or other formatting irregularities don't cause a
huge problem here..)
-- xsd2insertgenerator.xsl
<xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:od="urn:schemas-microsoft-com:officedata"
exclude-result-prefixes="xsd od"
version="1.0">
<xsl:output method="text" omit-xml-declaration="yes"/>
<!-- just imports the data, does not create the table.. etc -->
<xsl:variable name="schema" select="document('/www/MAD/MAD.xsd')"/>
<xsl:variable name="table"
select="$schema/xsd:schema/xsd:element[1]//@ref"/>
<xsl:variable name="element-list"
select="$schema/xsd:schema/xsd:element[(_at_)name=$schema/xsd:schema/xsd:element[1]//@ref]//xsd:element"/>
<!-- some fields may exist *only* in the schema document -->
<xsl:variable name="fields"
select="$schema/xsd:schema/xsd:element[(_at_)name=/xsd:schema/xsd:element[1]//@ref]//xsd:element"/>
<xsl:template match="/">
<xsl:value-of select="concat('COPY "', $table, '"
FROM stdin;
')"/>
<xsl:apply-templates select="/dataroot/*" mode="one-record"/>
<!-- Flat space imports only -->
<xsl:text>\.</xsl:text>
</xsl:template>
<xsl:template match="*" mode="one-record">
<xsl:variable name="record" select="."/>
<!-- Note: below I am looping over the entries in the XSD doc,
and looking up corresponding values in the XML record -->
<xsl:for-each select="$element-list">
<xsl:value-of select="$record/*[local-name(.) =
current()/@name]"/>
<xsl:if test="position() <
last()"><xsl:text>	</xsl:text></xsl:if>
</xsl:for-each>
<xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
-- xsd2insertgeneratorgenerator.xsl
<xsl:stylesheet
version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:gen="dummy-namespace-for-the-generated-xslt"
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:od="urn:schemas-microsoft-com:officedata"
exclude-result-prefixes="xsd od xsl">
<xsl:output method="xml" indent="yes"/>
<xsl:namespace-alias stylesheet-prefix="gen" result-prefix="xsl"/>
<xsl:variable name="schema" select="document('/www/MAD/MAD.xsd')"/>
<xsl:variable name="table"
select="$schema/xsd:schema/xsd:element[1]//@ref"/>
<xsl:variable name="element-list"
select="$schema/xsd:schema/xsd:element[(_at_)name=$schema/xsd:schema/xsd:element[1]//@ref]//xsd:element"/>
<xsl:template match="/">
<gen:stylesheet version="1.0">
<gen:output method="txt" omit-xml-declaration="yes"/>
<gen:template match="/">
<gen:value-of select="concat('COPY "',
'{$table}', '" FROM stdin;
')"/>
<gen:apply-templates
select="/dataroot/{$table}"/> <!-- Flat space imports only -->
<gen:text>\.</gen:text>
<gen:for-each select="{$table}">
<xsl:for-each select="$element-list">
<gen:value-of
select="{(_at_)name}"/>
<xsl:if test="position() <
last()">
<gen:text><xsl:value-of
select="'	'"/></gen:text>
</xsl:if>
</xsl:for-each>
<gen:text><xsl:value-of
select="'
'"/></gen:text>
</gen:for-each>
</gen:template>
<!--
<gen:template match="{$table}">
<xsl:for-each select="$element-list">
<gen:value-of select="{(_at_)name}"/>
<xsl:if test="position() < last()">
<gen:text><xsl:value-of
select="'	'"/></gen:text>
</xsl:if>
</xsl:for-each>
<gen:text><xsl:value-of
select="'
'"/></gen:text>
</gen:template>
-->
</gen:stylesheet>
</xsl:template>
</xsl:stylesheet>
-- MAD.xsd
<?xml version="1.0" encoding="UTF-8"?>
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:od="urn:schemas-microsoft-com:officedata">
<xsd:element name="dataroot">
<xsd:complexType>
<xsd:sequence>
<xsd:element ref="MAD" minOccurs="0" maxOccurs="unbounded"/>
</xsd:sequence>
<xsd:attribute name="generated" type="xsd:dateTime"/>
</xsd:complexType>
</xsd:element>
<xsd:element name="MAD">
<xsd:annotation>
<xsd:appinfo/>
</xsd:annotation>
<xsd:complexType>
<xsd:sequence>
<xsd:element name="ADDRESS_ID" minOccurs="1" od:jetType="double"
od:sqlSType="float" od:nonNullable="yes" type="xsd:double"/>
<xsd:element name="APN" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="14"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="PREFIX_TYPE" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="15"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="PREFIX_DIR" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="5"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="ST_NUMBER" minOccurs="0" od:jetType="double"
od:sqlSType="float" type="xsd:double"/>
<xsd:element name="ST_NAME" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="30"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="ST_TYPE" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="10"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="SUFFIX_DIR" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="5"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="CITY" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="25"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="ZIP" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="10"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="LONG_ADDRESS" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="100"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="ADDR_FRACTION" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="5"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="UNIT_DESIGNATOR" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="10"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="UNIT" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="10"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="FLOOR" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="5"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="BUILDING" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="30"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="ADDRESS_TYPE" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="10"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="ADDRESS_STATUS" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="20"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="ADDRESS_USE" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="25"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="PRIVATE" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="10"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="INCORPORATED" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="10"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="X_COORD" minOccurs="0" od:jetType="double"
od:sqlSType="float" type="xsd:double"/>
<xsd:element name="Y_COORD" minOccurs="0" od:jetType="double"
od:sqlSType="float" type="xsd:double"/>
<xsd:element name="DATE_ENTERED" minOccurs="0" od:jetType="datetime"
od:sqlSType="datetime" type="xsd:dateTime"/>
<xsd:element name="DATE_UPDATED" minOccurs="0" od:jetType="datetime"
od:sqlSType="datetime" type="xsd:dateTime"/>
<xsd:element name="DATE_RETIRED" minOccurs="0" od:jetType="datetime"
od:sqlSType="datetime" type="xsd:dateTime"/>
<xsd:element name="SOURCE" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="30"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="BUS_LIC_FLAG" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="10"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="SERC_REQ_FLAG" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="15"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="PROJECT_FLAG" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="15"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="PERMIT_FLAG" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="15"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="ZIP4_FLAG" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="1"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<xsd:element name="COORD_SOURCE" minOccurs="0" od:jetType="text"
od:sqlSType="nvarchar">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="10"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
</xsd:sequence>
</xsd:complexType>
</xsd:element>
</xsd:schema>
-- foo.xml (test data)
<?xml version="1.0" encoding="UTF-8"?>
<dataroot xmlns:od="urn:schemas-microsoft-com:officedata"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="MAD.xsd" generated="2005-04-29T15:13:49">
<MAD>
<ADDRESS_ID>1</ADDRESS_ID>
<APN>22400900350000</APN>
<ST_NUMBER>7453</ST_NUMBER>
<ST_NAME>TIARA</ST_NAME>
<ST_TYPE>WAY</ST_TYPE>
<CITY>ORANGEVALE</CITY>
<ZIP>95662</ZIP>
<LONG_ADDRESS>7453 TIARA WAY B</LONG_ADDRESS>
<UNIT>B</UNIT>
<X_COORD>6768867.42298</X_COORD>
<Y_COORD>2018154.73453</Y_COORD>
<DATE_ENTERED>2005-04-19T00:00:00</DATE_ENTERED>
<SOURCE>GISDROP_20050225</SOURCE>
<BUS_LIC_FLAG>N</BUS_LIC_FLAG>
<SERC_REQ_FLAG>N</SERC_REQ_FLAG>
<PROJECT_FLAG>N</PROJECT_FLAG>
<PERMIT_FLAG>Y</PERMIT_FLAG>
<ZIP4_FLAG>N</ZIP4_FLAG>
<COORD_SOURCE>CENTROID</COORD_SOURCE>
</MAD>
<MAD>
<ADDRESS_ID>8</ADDRESS_ID>
<APN>24301920130000</APN>
<ST_NUMBER>6054</ST_NUMBER>
<ST_NAME>SUNRISE</ST_NAME>
<ST_TYPE>MALL</ST_TYPE>
<CITY>CITRUS HEIGHTS</CITY>
<ZIP>95610</ZIP>
<LONG_ADDRESS>6054 SUNRISE MALL B7</LONG_ADDRESS>
<UNIT>B7</UNIT>
<X_COORD>6770356.0616</X_COORD>
<Y_COORD>2007415.14451</Y_COORD>
<DATE_ENTERED>2005-04-19T00:00:00</DATE_ENTERED>
<SOURCE>GISDROP_20050225</SOURCE>
<BUS_LIC_FLAG>N</BUS_LIC_FLAG>
<SERC_REQ_FLAG>N</SERC_REQ_FLAG>
<PROJECT_FLAG>N</PROJECT_FLAG>
<PERMIT_FLAG>Y</PERMIT_FLAG>
<ZIP4_FLAG>Y</ZIP4_FLAG>
<COORD_SOURCE>CENTROID</COORD_SOURCE>
</MAD>
<MAD>
<ADDRESS_ID>9</ADDRESS_ID>
<APN>24301500230000</APN>
<ST_NUMBER>7669</ST_NUMBER>
<ST_NAME>GREENBACK</ST_NAME>
<ST_TYPE>LN</ST_TYPE>
<ZIP>0</ZIP>
<LONG_ADDRESS>7669 GREENBACK LN 2002</LONG_ADDRESS>
<UNIT>2002</UNIT>
<X_COORD>6766774.11111</X_COORD>
<Y_COORD>2010059.02536</Y_COORD>
<DATE_ENTERED>2005-04-19T00:00:00</DATE_ENTERED>
<SOURCE>GISDROP_20050225</SOURCE>
<BUS_LIC_FLAG>N</BUS_LIC_FLAG>
<SERC_REQ_FLAG>N</SERC_REQ_FLAG>
<PROJECT_FLAG>N</PROJECT_FLAG>
<PERMIT_FLAG>Y</PERMIT_FLAG>
<ZIP4_FLAG>N</ZIP4_FLAG>
<COORD_SOURCE>CENTROID</COORD_SOURCE>
</MAD>
</dataroot>
--~------------------------------------------------------------------
XSL-List info and archive: http://www.mulberrytech.com/xsl/xsl-list
To unsubscribe, go to: http://lists.mulberrytech.com/xsl-list/
or e-mail: <mailto:xsl-list-unsubscribe(_at_)lists(_dot_)mulberrytech(_dot_)com>
--~--