Parsing HTML with BeautifulSoup in Python

  • 2020-04-02 09:19:19
  • OfStack


# coding=utf-8 
from BeautifulSoup import BeautifulSoup, Tag, NavigableString 
from SentenceSpliter import SentenceSpliter 
from os.path import basename,dirname,isdir,isfile 
from os import makedirs 
from shutil import copyfile 
import io 
import time 
import re 

class build_tpl:
    """Build a sentence-level template from an HTML file.

    On construction the HTML file is parsed, every non-blank text node in
    ``<body>`` is collected and split into sentences with ``SentenceSpliter``,
    and a replacement string (one ``<span>`` per sentence) is prepared for
    each text node.  ``builder()`` then rewrites the document and writes the
    template; ``save_data_file()`` writes the extracted sentences.
    """

    def __init__(self, parse_file, build_tpl_name, cp_pic_dir, show_pic_dir, js_path, set_lang=2052):
        """Parse ``parse_file`` and prepare all replacement data.

        Parameters:
            parse_file     -- path of the HTML file to parse
            build_tpl_name -- path of the template file to create
            cp_pic_dir     -- directory prefix the pictures are copied to
            show_pic_dir   -- path prefix written into each <img src>
            js_path        -- path prefix for the injected script tags
            set_lang       -- value handed to SentenceSpliter.SetLang()
                              (presumably a locale id; confirm against
                              SentenceSpliter)

        Raises:
            IOError -- if ``parse_file`` could not be read.
        """
        # Directory containing the parsed file, always with a trailing "/".
        # Testing for emptiness (rather than length > 1) keeps one-character
        # directory names such as "a/x.html" working.
        parent_dir = dirname(parse_file)
        if not parent_dir:
            self.cur_dir = "./"
        elif parent_dir.endswith("/"):
            self.cur_dir = parent_dir
        else:
            self.cur_dir = parent_dir + "/"

        # Name of the template file to create
        self.build_tpl_name = build_tpl_name
        # Directory prefix the pictures are copied to
        self.cp_pic_dir = cp_pic_dir
        # Path prefix under which the pictures are shown
        self.show_pic_dir = show_pic_dir
        # Path prefix of the JavaScript files to load
        self.js_path = js_path

        # Text nodes found in the document body
        self.get_text_arr = []
        # <img> tags found in the document
        self.cur_pic_arr = []
        # Text nodes that builder() failed to replace
        self.bug_msg = []

        # Parse the file; get_soup() reports a read failure by returning False
        self.soup = self.get_soup(parse_file)
        if self.soup is False:
            raise IOError("cannot read parse file: %s" % parse_file)
        # Every text node of the body that is not pure whitespace
        self.get_text_arr = self.soup.body.findAll(text=lambda x: len(x.strip()) > 0)
        # Flat list of sentences
        self.get_sentence_arr = self.parse_text(self.get_text_arr, set_lang)
        # One replacement string per text node
        self.replace_list = self.get_replace_list(self.get_text_arr, set_lang)
        # All image tags
        self.cur_pic_arr = self.soup.findAll('img')

    def save_data_file(self):
        """Write the sentences to ``<build_tpl_name>.data``, one per line."""
        file_name = self.build_tpl_name + ".data"
        self.write_file_by_list(file_name, self.get_data())

    def get_data(self):
        """Return the flat list of sentences extracted from the document."""
        return self.get_sentence_arr

    def write_file_by_list(self, file_name, write_arr):
        """Write the strings of ``write_arr`` to ``file_name``.

        The items are joined with newlines and encoded as UTF-8.
        """
        # "with" closes the handle even when the write raises
        with io.FileIO(file_name, "w") as out:
            out.write(('\n'.join(write_arr)).encode('utf-8'))

    def write_file(self, file_name, file_contents):
        """Write ``file_contents`` to ``file_name`` encoded as UTF-8."""
        with io.FileIO(file_name, "w") as out:
            out.write(file_contents.encode('utf-8'))

    def get_pic_hash(self):
        """Return today's date as a ``YYYY/MM/DD/`` sub-directory path."""
        return time.strftime("%Y/%m/%d/")

    def builder(self):
        """Rewrite the parsed document and write it to ``build_tpl_name``.

        Replaces each collected text node with its prepared replacement,
        points every image at the display path and copies the image file,
        inserts two script tags into ``<head>`` and writes the result.
        Text nodes that could not be replaced are kept in ``self.bug_msg``.
        """
        self.bug_msg = []
        for text, replacement in zip(self.get_text_arr, self.replace_list):
            try:
                self.soup.body.find(text=text).replaceWith(replacement)
            except AttributeError:
                # find() returned nothing for this text; remember it
                self.bug_msg.append(text)

        # Date based sub-directory shared by the display and copy paths
        hash_dir = self.get_pic_hash()
        show_pic_dir = self.show_pic_dir + hash_dir
        cp_pic_dir = self.cp_pic_dir + hash_dir

        # Create the picture target directory when it does not exist yet
        if not isdir(cp_pic_dir):
            makedirs(cp_pic_dir)

        for pic_tag in self.cur_pic_arr:
            old_pic_src = pic_tag['src']
            # Point the tag at the display location ...
            pic_tag['src'] = show_pic_dir + old_pic_src
            # ... and copy the picture file next to it
            copyfile(self.cur_dir + old_pic_src, cp_pic_dir + old_pic_src)

        # Script tags to add to <head>
        jquery_tag = Tag(self.soup, "script")
        jquery_tag['type'] = "text/javascript"
        jquery_tag['src'] = self.js_path + "jquery.js"

        init_tag = Tag(self.soup, "script")
        init_tag['type'] = "text/javascript"
        init_tag['src'] = self.js_path + "init.js"

        # Both go in at index 2; inserting init.js first leaves jquery.js
        # ahead of it in the document.
        self.soup.head.insert(2, init_tag)
        self.soup.head.insert(2, jquery_tag)

        # NOTE(review): write_file() calls .encode('utf-8') on the soup
        # object itself -- confirm the installed BeautifulSoup provides it.
        self.write_file(self.build_tpl_name, self.soup)

    def get_replace_html(self, rep_id, rep_data=""):
        """Return the ``<span>`` markup for one sentence.

        Parameters:
            rep_id   -- number used in the span id (``rep_<rep_id>``)
            rep_data -- span content; when empty the template placeholder
                        ``$rep_arr[<rep_id>]`` is used instead
        """
        if rep_data:
            rep_str = rep_data
        else:
            rep_str = "$rep_arr[{0}]".format(rep_id)
        return '<span sty="data" id="rep_' + str(rep_id) + '">' + rep_str + '</span>'

    def _split_texts(self, text_arr, set_lang):
        """Yield, for each text in ``text_arr``, its list of sentences."""
        splitter = SentenceSpliter()
        splitter.SetLang(set_lang)
        for text in text_arr:
            pieces = splitter.Split(text)
            # Split() returning None means the text is kept as one sentence
            yield [text] if pieces is None else pieces

    def get_replace_list(self, text_arr, set_lang):
        """Return one replacement string per entry of ``text_arr``.

        Each replacement is the concatenation of one span per sentence; the
        span ids count up across the whole document.
        """
        replacements = []
        next_id = 0
        for pieces in self._split_texts(text_arr, set_lang):
            spans = []
            for piece in pieces:
                spans.append(self.get_replace_html(next_id, piece))
                next_id += 1
            replacements.append("".join(spans))
        return replacements

    def parse_text(self, text_arr, set_lang):
        """Return the flat list of sentences of all texts in ``text_arr``."""
        sentences = []
        for pieces in self._split_texts(text_arr, set_lang):
            sentences.extend(pieces)
        return sentences

    def get_soup(self, parse_file):
        """Read ``parse_file`` and return it parsed by BeautifulSoup.

        Prints an error message and returns ``False`` when the file cannot
        be read.
        """
        try:
            with io.FileIO(parse_file, "r") as source:
                doc = source.readall()
        except IOError:
            print('ERROR: %s file not found!' % parse_file)
            return False
        # Start parsing the HTML document
        return BeautifulSoup(doc)

if __name__ == "__main__":
    # Command line entry point:
    #   <input-file> <output-file> <save-pic-dir> <show-pic-dir> <js-path>
    from sys import argv, exit

    # All five arguments are read below, so require every one of them here
    # instead of failing later with an IndexError.
    if len(argv) < 6:
        print("USAGE: python %s <input-file> <output-file> <save-pic-dir> <show-pic-dir> <js-path>" % argv[0])
        exit(255)

    if not isfile(argv[1]):
        print("no such input file: %s" % argv[1])
        exit(1)

    paser_file, tpl_file, save_pic_path, show_pic_path, load_js_path = argv[1:6]
    # Parse the file and prepare the template data
    so = build_tpl(paser_file, tpl_file, save_pic_path, show_pic_path, load_js_path)
    # Write the template
    so.builder()
    # Save the extracted sentences
    so.save_data_file()

Related articles: