Sample: using Python to parse XML into the corresponding HTML

  • 2020-04-02 13:34:42
  • OfStack

This example uses SAX to parse ddt.xml into HTML. Of course, if you have the corresponding XSL file for the XML, you can convert it to HTML directly using libxml2.


#!/usr/bin/env python 
# -*- coding: utf-8 -*-
#---------------------------------------
#    Application: XML parser
#    Version: 01.0
#    Author: mupeng
#    Date: 2013-12-18
#    Language: Python 2.7
#    Function: convert an XML file into the corresponding HTML
#    Comments: this program uses the parse function of the xml.sax module to
#    parse the XML and generate events; it inherits from ContentHandler and
#    overrides its event handlers.
#    Dispatcher is mainly used to dispatch the start and end events of the corresponding tags.
#---------------------------------------
from xml.sax.handler import ContentHandler
from xml.sax import parse
class Dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    For an element ``foo`` the start event is sent to ``startFoo`` and the
    end event to ``endFoo``.  When no such callable exists on the instance
    the event goes to ``defaultStart`` / ``defaultEnd`` instead, and when
    that is missing too the event is silently ignored.
    """

    def dispatch(self, prefix, name, attrs=None):
        """Call the handler for element *name*, or the generic fallback.

        prefix -- event kind; startElement passes 'start', endElement 'end'
        name   -- element name; it is passed through str.capitalize(), so
                  'pubDate' is looked up as '<prefix>Pubdate'
        attrs  -- accepted but not forwarded: handlers take no arguments
        """
        handler = getattr(self, prefix + name.capitalize(), None)
        if not callable(handler):
            # No tag-specific handler: fall back to defaultStart/defaultEnd.
            handler = getattr(self, 'default' + prefix.capitalize(), None)
        if callable(handler):
            handler()

    def startElement(self, name, attrs):
        """SAX callback: forward an element-start event to dispatch()."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback: forward an element-end event to dispatch()."""
        self.dispatch('end', name)
class Website(Dispatcher, ContentHandler):
    """SAX handler that renders an RSS document as an HTML page.

    Dispatcher sends every element start/end event to the matching
    ``start<Name>`` / ``end<Name>`` method below, or to ``defaultStart`` /
    ``defaultEnd`` when there is none.  ``characters`` accumulates element
    text in ``self.temp`` and each ``end*`` handler consumes it.

    What is emitted for an element depends on where it occurs: directly in
    the channel, inside ``<image>`` (``self.imagein``) or inside ``<item>``
    (``self.item``).  The handlers rely on the element order of the feed
    (channel title, image, description, link, then the items).

    NOTE(review): text taken from the feed is written into the page without
    HTML escaping; only feed it trusted input.
    """

    def __init__(self, out_path='ddt_SAX.html'):
        """Open the output file and reset the parsing state.

        out_path -- file the HTML is written to; defaults to the name the
                    script has always used.
        """
        ContentHandler.__init__(self)
        # Binary mode: every write goes through _write(), which hands the
        # file encoded bytes on both Python 2 and Python 3.
        self.fout = open(out_path, 'wb')
        self.imagein = False   # True while inside <image>
        self.desflag = False
        self.item = False      # True while inside <item>
        self.title = ''
        self.link = ''
        self.guid = ''
        self.url = ''
        self.pubdate = ''
        self.description = ''
        self.temp = ''         # text collected for the current element
        self.prx = ''

    def _write(self, text):
        """Write *text* to the output file as GB2312 bytes.

        Byte strings pass through untouched.  Text strings are encoded;
        characters GB2312 cannot represent become numeric character
        references instead of raising UnicodeEncodeError.
        """
        if not isinstance(text, bytes):
            text = text.encode('gb2312', 'xmlcharrefreplace')
        self.fout.write(text)

    def _take_text(self):
        """Return the text collected for the current element and clear it."""
        text, self.temp = self.temp, ''
        return text

    def _close(self):
        """Close the output file; safe to call more than once."""
        if not self.fout.closed:
            self.fout.close()

    def characters(self, chars):
        """Accumulate element text; whitespace-only chunks are ignored."""
        if chars.strip():
            self.temp += chars

    def endDocument(self):
        """Close the output even when the document had no </channel>."""
        self._close()

    def defaultStart(self):
        """Start of an element without a dedicated handler: nothing to do."""
        pass

    def defaultEnd(self):
        """End of an element without a dedicated handler: drop its text."""
        self.temp = ''

    def startChannel(self):
        """Begin the page; the channel title completes the <title> tag."""
        self._write('''<html>\n<head>\n<title> RSS-''')

    def endChannel(self):
        """Write the page footer (with the date script) and close the file."""
        self._write('''
                    <tr><td height="20"></td></tr>
                    </table>
                    </center>
                    <script>
    function  GetTimeDiff(str)
    {
     if(str == '')
     {
      return '';
     }
     var pubDate = new Date(str);
     var nowDate = new Date();
     var diffMilSeconds = nowDate.valueOf()-pubDate.valueOf();
     var days = diffMilSeconds/86400000;
     days = parseInt(days);
     diffMilSeconds = diffMilSeconds-(days*86400000);
     var hours = diffMilSeconds/3600000;
     hours = parseInt(hours);
     diffMilSeconds = diffMilSeconds-(hours*3600000);
     var minutes = diffMilSeconds/60000;
     minutes = parseInt(minutes);
     diffMilSeconds = diffMilSeconds-(minutes*60000);
     var seconds = diffMilSeconds/1000;
     seconds = parseInt(seconds);

     var returnStr = " + / - ¾© ・ ¢²¼Ê Plus or minus ¼ä£º" + pubDate.toLocaleString();
     if(days > 0)
     {
      returnStr = returnStr + " £ ¨ ¾ a. ÀëÏÖÔÚ" + days + "Ì i. " + hours + "Ð¡Ê Plus or minus " + minutes + " ・ ÖÖÓ£©";
     }
     else if (hours > 0)
     {
      returnStr = returnStr + " £ ¨ ¾ a. ÀëÏÖÔÚ" + hours + "Ð¡Ê Plus or minus " + minutes + " ・ ÖÖÓ£©";
     }
     else if (minutes > 0)
     {
      returnStr = returnStr + " £ ¨ ¾ a. ÀëÏÖÔÚ" + minutes + " ・ ÖÖÓ£©";
     }
     return returnStr;
    }
    function GetSpanText()
    {
     var pubDate;
     var pubDateArray;
     var spanArray = document.getElementsByTagName("span");
     for(var i = 0; i < spanArray.length; i++)
     {
      pubDate = spanArray[i].innerHTML;
      document.getElementsByTagName("span")[i].innerHTML = GetTimeDiff(pubDate);   
     }
    }
    GetSpanText();
   </script>
                </body>
                </html>
                ''')
        self._close()

    def startTitle(self):
        """Open the heading row for an item title."""
        if self.item:
            self._write('''
                        <tr bgcolor="#eeeeee">\n<td style="padding-top:5px;padding-left:5px;" height="30">\n<B>
                    ''')

    def endTitle(self):
        """Emit the title: page header for the channel, heading for an item.

        The title of an <image> is not rendered; its text is discarded so
        that it cannot leak into the image link that follows it.
        """
        text = self._take_text()
        if self.item:
            self.title = text
            self._write(text)
            self._write('''
                        </B>
                        </td>
                        </tr>
                        <tr bgcolor="#eeeeee">
                        <td style="padding-left:5px;">
                        ''')
        elif not self.imagein:
            self.title = text
            self._write(text)
            # The JS regex is written with an escaped backslash so the page
            # receives /^\// (strip leading slashes from the path).
            self._write('''
                </title>\n</head>\n<body>\n<center>\n
                <script>\n
                        function copyLink()
                        {
                                clipboardData.setData("Text",window.location.href);
                                alert("RSSÁ´½ÓÒѾ­¸´ÖƵ½¼ôÌ U ° å");
                        }
                        function subscibeLink()
                        {
                                var str = window.location.pathname;
                                while(str.match(/^\\//))
                                {
                                        str = str.replace(/^\\//,"");
                                }
                                window.open("http://rss.sina.com.cn/my_sina_web_rss_news.html?url=" + str,"_self");
                        }
                        </script>\n
                <table width="750" cellpadding="0" cellspacing="0">\n
                <tr>\n
                <td align="right" style="padding-right:15px;" valign="bottom">\n
            ''')

    def startImage(self):
        """Enter <image>."""
        self.imagein = True

    def endImage(self):
        """Leave <image>."""
        self.imagein = False

    def startLink(self):
        """Open the anchor that wraps the channel image."""
        if self.imagein:
            self._write('''<A href=" ''')

    def endLink(self):
        """Record the link; emit it for the image and for the channel.

        An item link is only stored here; endItem writes it.
        """
        self.link = self._take_text()
        if self.imagein:
            self._write(self.link)
            self._write('''" target="_blank">\n ''')
        elif not self.item:
            # Channel link: finishes the anchor opened at the end of endUrl,
            # then prints the channel title, description and toolbar row.
            self._write(self.link)
            self._write(''' " target="_blank"> ''')
            self._write(self.title)
            self._write(''' </A></B></td>
                            </tr>
                            <tr><td colspan="2" align="center">
                            ''')
            self._write(self.description)
            self._write('''
                        </td></tr>
                        <tr style="font-size:12px;" bgcolor="#eeeeff"><td colspan="2" style="font-size:14px;padding-top:5px;padding-bottom:5px;"><b><a href="javascript:copyLink();">¸´ÖÆ´ËÒ³Á´½Ó</a>                <a href="javascript:subscibeLink();">ÎÒҪǶÈë¸ÃÐÂÎÅÁÐ Plus or minus the I µ½ÎÒµÄÒ³Ãæ£ ¨ ¼ o µ¥¡¢¿ i. ËÙ¡¢ÊµÊ Plus or minus ¡¢Ãâ ・ Ñ£©</a></b></td></tr>
                        </table>
                        <table width="750" cellpadding="0" cellspacing="0">
                            ''')

    def startUrl(self):
        """Open the <IMG> tag for the channel image."""
        if self.imagein:
            self._write('''<IMG src=" ''')

    def endUrl(self):
        """Record the url; inside <image> finish the <IMG> tag.

        The markup written here ends with an open ``<A href="`` that the
        channel link completes in endLink.
        """
        self.url = self._take_text()
        if self.imagein:
            self._write(self.url)
            self._write('''" border="0">\n
                            </A>
                            </td>
                            <td align="left" valign="bottom" style="padding-bottom:8px;"><B><A href="
                            ''')

    def startDescription(self):
        """Nothing is emitted when a description starts."""
        pass

    def endDescription(self):
        """Record the description; an item description is written at once."""
        self.description = self._take_text()
        if self.item:
            self._write(self.description)

    def endGuid(self):
        """Record the guid and clear the text buffer for the next element."""
        self.guid = self._take_text()

    def endPubdate(self):
        """Record the publication date.

        Text starting with 'http' is treated as not being a date and
        yields an empty pubdate.
        """
        text = self._take_text()
        self.pubdate = '' if text.startswith('http') else text

    def startItem(self):
        """Enter <item> and forget values left over from earlier elements."""
        self.item = True
        # endItem prints these; without the reset an item that omits one of
        # them would show the previous item's (or the channel's) value.
        self.link = ''
        self.guid = ''
        self.pubdate = ''

    def endItem(self):
        """Leave <item>: write its link row and its date row."""
        self.item = False
        self._write('''
                            </td>
                            </tr>
                            <tr bgcolor="#eeeeee">
                            <td style="padding-top:5px;padding-left:5px;">
                            <A href="''')
        self._write(self.link)
        self._write(''' " target="_blank"> ''')
        self._write(self.guid)
        self._write('''
                        </A>
                        </td>
                        </tr>
                        <tr bgcolor="#eeeeee">
                        <td style="padding-top:5px;padding-left:5px;padding-bottom:5px;"><span>''')
        self._write(self.pubdate)
        self._write('''</span></td>
                        </tr>
                        <tr height="10"><td></td></tr>''')
# Program entry point: run the SAX parser over ddt.xml with a Website handler.
if __name__ == '__main__':
    handler = Website()
    parse('ddt.xml', handler)


Related articles: