Parsing HTML in Python with BeautifulSoup

  • 2020-04-02 09:19:19
  • OfStack

# coding=utf-8 
from BeautifulSoup import BeautifulSoup, Tag, NavigableString 
from SentenceSpliter import SentenceSpliter 
from os.path import basename,dirname,isdir,isfile 
from os import makedirs 
from shutil import copyfile 
import io 
import time 
import re 

class build_tpl:
    """Build a template file from an HTML page.

    The constructor parses the page, collects its non-blank text nodes and
    its ``<img>`` tags, and prepares a ``<span>``-wrapped replacement for
    every text node.  ``builder()`` then rewrites the document and writes the
    template; ``save_data_file()`` writes the extracted sentences.

    NOTE(review): the listing this class was recovered from had lost several
    lines (``else`` branches, ``try`` headers, whole method bodies).  Every
    reconstructed part is marked with ``NOTE(review)`` below and should be
    checked against the original project.
    """

    def __init__(self, parse_file, build_tpl_name, cp_pic_dir, show_pic_dir, js_path, set_lang=2052):
        """Parse ``parse_file`` and prepare everything ``builder()`` needs.

        Parameters:
            parse_file: path of the HTML file to parse.
            build_tpl_name: path of the template file to create.
            cp_pic_dir: directory prefix the images are copied to.
            show_pic_dir: path prefix written into the ``src`` of each image.
            js_path: path prefix of the JavaScript files to load.
            set_lang: current language, intended for sentence splitting.
        """
        # Directory of the parsed file; image sources are resolved against it.
        if len(dirname(parse_file)) > 1:
            self.cur_dir = dirname(parse_file) + "/"
        else:
            # The dropped ``else`` made this assignment unconditional.
            self.cur_dir = "./"

        # Name of the template file to create.
        self.build_tpl_name = build_tpl_name
        # Directory the pictures are copied to.
        self.cp_pic_dir = cp_pic_dir
        # Path under which the pictures are displayed over HTTP.
        self.show_pic_dir = show_pic_dir
        # Path the JavaScript files are loaded from.
        self.js_path = js_path

        # Text nodes of the document.
        self.get_text_arr = []
        # <img> tags of the document.
        self.cur_pic_arr = []

        # Parse the file; get_soup() returns False when it cannot be read.
        self.soup = self.get_soup(parse_file)
        # Every text node of <body> that is not pure whitespace.
        self.get_text_arr = self.soup.body.findAll(text=lambda x: len(x.strip()) > 0)
        # The text nodes split into sentences.
        self.get_sentence_arr = self.parse_text(self.get_text_arr, set_lang)
        # One replacement string per text node.
        self.replace_list = self.get_replace_list(self.get_text_arr, set_lang)
        # All images of the document.
        self.cur_pic_arr = self.soup.findAll('img')

    def save_data_file(self):
        """Write the sentences to ``<build_tpl_name>.data``.

        NOTE(review): only the file name survived in the listing; writing one
        sentence per line is a reconstruction - confirm the expected format.
        """
        file_name = self.build_tpl_name + ".data"
        self.write_file_by_list(file_name, self.get_sentence_arr)

    def get_data(self):
        """Return the list of sentences produced by ``parse_text()``."""
        return self.get_sentence_arr

    def write_file_by_list(self, file_name, write_arr):
        """Write the strings of ``write_arr`` to ``file_name``, newline-separated.

        NOTE(review): body reconstructed - the listing kept only the signature.
        """
        self.write_file(file_name, "\n".join(write_arr))

    def write_file(self, file_name, file_contents):
        """Write ``file_contents`` to ``file_name``, encoding text as UTF-8.

        Byte strings are written unchanged.

        NOTE(review): body reconstructed - the listing kept only the signature.
        """
        if not isinstance(file_contents, bytes):
            file_contents = file_contents.encode('utf-8')
        # The context manager closes the file even if the write fails.
        with io.open(file_name, "wb") as out:
            out.write(file_contents)

    def get_pic_hash(self):
        """Return today's date as a ``YYYY/MM/DD/`` sub-directory path."""
        return time.strftime("%Y/%m/%d/")

    def builder(self):
        """Rewrite the parsed document and write it out as the template.

        Steps: swap each text node for its prepared replacement, point every
        image at the display path and copy the image file, add the two
        ``<script>`` tags, then write the document to ``build_tpl_name``.
        Text nodes that could not be replaced are kept in ``self.bug_msg``.
        """
        # Text nodes whose replacement failed.
        bug_msg = []
        for text_node, replacement in zip(self.get_text_arr, self.replace_list):
            # NOTE(review): the ``try`` body was missing from the listing;
            # replacing node i with replace_list[i] is inferred from the
            # otherwise unused ``replace_list`` - confirm.
            try:
                self.soup.body.find(text=text_node).replaceWith(replacement)
            except AttributeError:
                # find() returned None: the node is no longer in the tree.
                bug_msg.append(text_node)
        self.bug_msg = bug_msg

        # Date-based sub-directory shared by the display and the copy path.
        hash_dir = self.get_pic_hash()
        show_pic_dir = self.show_pic_dir + hash_dir
        cp_pic_dir = self.cp_pic_dir + hash_dir

        # Create the image directory when it does not exist yet.
        if not isdir(cp_pic_dir):
            makedirs(cp_pic_dir)

        for pic_name in self.cur_pic_arr:
            # Point the tag at the display location.
            old_pic_src = pic_name['src']
            pic_name['src'] = show_pic_dir + old_pic_src
            # Copy the image file next to the other copied images.
            cp_src_file = self.cur_dir + old_pic_src
            cp_dis_file = cp_pic_dir + old_pic_src
            # A src such as "img/a.png" needs its own sub-directory first.
            cp_dis_dir = dirname(cp_dis_file)
            if cp_dis_dir and not isdir(cp_dis_dir):
                makedirs(cp_dis_dir)
            # NOTE(review): the copy call was missing from the listing;
            # copyfile is imported by the file and not used anywhere else.
            copyfile(cp_src_file, cp_dis_file)

        # Script tags for the two JavaScript files.
        tag = Tag(self.soup, "script")
        tag['type'] = "text/javascript"
        tag['src'] = self.js_path + "jquery.js"

        tag2 = Tag(self.soup, "script")
        tag2['type'] = "text/javascript"
        tag2['src'] = self.js_path + "init.js"

        # NOTE(review): the listing built both tags but never attached them.
        # Appending them to <head>, jquery.js first, is a reconstruction -
        # confirm the intended position.
        head = self.soup.head
        head.insert(len(head.contents), tag)
        head.insert(len(head.contents), tag2)

        # Write the template.
        # NOTE(review): assumes str(soup) serialises the whole document
        # (BeautifulSoup 3 behaviour) - confirm.
        self.write_file(self.build_tpl_name, str(self.soup))

    def get_replace_html(self, rep_id, rep_data=""):
        """Return the ``<span>`` markup for replacement slot ``rep_id``.

        Parameters:
            rep_id: numeric id of the slot, used in the span's ``id``.
            rep_data: content of the span; when empty the template
                placeholder ``$rep_arr[<rep_id>]`` is used instead.
        """
        if len(rep_data) > 0:
            rep_str = rep_data
        else:
            rep_str = "$rep_arr[{0}]".format(rep_id)
        return "<span sty=\"data\" id=\"rep_" + str(rep_id) + "\">" + rep_str + "</span>"

    def get_replace_list(self, text_arr, set_lang):
        """Return one replacement string per entry of ``text_arr``.

        Each sentence of an entry is wrapped by ``get_replace_html()`` under
        a running id; an entry the splitter returns ``None`` for is wrapped
        as a whole.

        NOTE(review): the listing never advanced ``jump_i`` nor filled the
        result list; both are reconstructed.  ``set_lang`` is not forwarded
        to SentenceSpliter because no such call is visible - confirm.
        """
        Sp = SentenceSpliter()
        temp_sentence = []
        jump_i = 0
        for text in text_arr:
            SList = Sp.Split(text)
            replace_temp = ""
            if SList is not None:
                for item in SList:
                    replace_temp = replace_temp + self.get_replace_html(jump_i, item)
                    jump_i += 1
            else:
                replace_temp = self.get_replace_html(jump_i, text)
                jump_i += 1
            temp_sentence.append(replace_temp)
        return temp_sentence

    def parse_text(self, text_arr, set_lang):
        """Split every entry of ``text_arr`` into sentences.

        NOTE(review): the loop body was missing from the listing.  Collecting
        each sentence, and keeping an entry whole when the splitter returns
        ``None``, is a reconstruction.  ``set_lang`` is not forwarded to
        SentenceSpliter because no such call is visible - confirm.
        """
        Sp = SentenceSpliter()
        temp_sentence = []
        for text in text_arr:
            SList = Sp.Split(text)
            if SList is not None:
                temp_sentence.extend(SList)
            else:
                temp_sentence.append(text)

        return temp_sentence

    def get_soup(self, parse_file):
        """Read ``parse_file`` and return it parsed by BeautifulSoup.

        Returns ``False`` after printing an error when the file cannot be
        read.
        """
        try:
            # NOTE(review): the line opening the file was missing from the
            # listing; io.FileIO matches the surviving ``readall()`` call.
            with io.FileIO(parse_file, "r") as source:
                doc = source.readall()
        except IOError:
            print('ERROR: %s file not found!' % parse_file)
            return False
        # Start parsing the HTML document.
        return BeautifulSoup(doc)

if __name__ == "__main__":
    # Command-line entry point: build the template and the sentence data file
    # from the five positional arguments.
    from sys import argv, exit

    # argv[1]..argv[5] are all read below, so every one of them is required.
    if len(argv) < 6:
        print("USAGE: python %s <input-file> <output-file> <save-pic-path> <show-pic-path> <js-path>" % argv[0])
        exit(2)

    # Stop here instead of failing later inside the parser.
    if not isfile(argv[1]):
        print("no such input file: %s" % argv[1])
        exit(1)

    paser_file = argv[1]  # e.g. "html/testpic.html"
    tpl_file = argv[2]
    save_pic_path = argv[3]
    show_pic_path = argv[4]
    load_js_path = argv[5]
    # Start parsing: input file, template name, image save path, image
    # display path and JavaScript path.
    so = build_tpl(paser_file, tpl_file, save_pic_path, show_pic_path, load_js_path)
    # Build the template.
    so.builder()
    # Save the split sentences.
    so.save_data_file()

Related articles: