# coding=utf-8
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
from SentenceSpliter import SentenceSpliter
from os.path import basename,dirname,isdir,isfile
from os import makedirs
from shutil import copyfile
import io
import time
import re
class build_tpl(object):
    """Build a translation template from an HTML file.

    The constructor parses the HTML, collects every non-blank text node of
    ``<body>``, splits those nodes into sentences and prepares, for each text
    node, replacement markup in which every sentence is wrapped in a
    ``<span sty="data" id="rep_N">`` element.  ``builder`` then rewrites the
    document (text nodes, image paths, extra ``<script>`` tags) and writes
    the template; ``save_data_file`` writes the sentences, one per line.
    """

    def __init__(self, parse_file, build_tpl_name, cp_pic_dir, show_pic_dir,
                 js_path, set_lang=2052):
        """Parse *parse_file* and precompute the data ``builder`` needs.

        Parameters:
            parse_file: path of the HTML file to parse.
            build_tpl_name: path of the template file to create.
            cp_pic_dir: path prefix the images are copied under.
            show_pic_dir: prefix put in front of every image ``src``.
            js_path: prefix of the ``jquery.js`` / ``init.js`` script URLs.
            set_lang: language id passed to ``SentenceSpliter.SetLang``.

        Raises:
            IOError: if *parse_file* cannot be read.
        """
        # Directory of the parsed file; image sources are resolved against it.
        self.cur_dir = self._source_dir(parse_file)
        self.build_tpl_name = build_tpl_name
        self.cp_pic_dir = cp_pic_dir
        self.show_pic_dir = show_pic_dir
        self.js_path = js_path
        # Text nodes of the document body.
        self.get_text_arr = []
        # <img> tags of the document.
        self.cur_pic_arr = []

        self.soup = self.get_soup(parse_file)
        if self.soup is False:
            # get_soup already reported the problem; fail with a clear error
            # instead of an AttributeError on a bool further down.
            raise IOError("cannot read input file: %s" % parse_file)

        # Every text node of <body> that is not pure whitespace.
        self.get_text_arr = self.soup.body.findAll(
            text=lambda x: len(x.strip()) > 0)
        # Flat list of sentences (content of the .data file).
        self.get_sentence_arr = self.parse_text(self.get_text_arr, set_lang)
        # One replacement markup string per text node.
        self.replace_list = self.get_replace_list(self.get_text_arr, set_lang)
        self.cur_pic_arr = self.soup.findAll('img')

    @staticmethod
    def _source_dir(parse_file):
        """Return the directory of *parse_file* with a trailing slash."""
        parent_dir = dirname(parse_file)
        if not parent_dir:
            return "./"
        if parent_dir.endswith("/"):
            return parent_dir
        return parent_dir + "/"

    def save_data_file(self):
        """Write the sentences to ``<build_tpl_name>.data``, one per line."""
        file_name = self.build_tpl_name + ".data"
        self.write_file_by_list(file_name, self.get_data())

    def get_data(self):
        """Return the flat list of sentences extracted from the document."""
        return self.get_sentence_arr

    def write_file_by_list(self, file_name, write_arr):
        """Write the strings of *write_arr* to *file_name*, newline-separated.

        The output is encoded as UTF-8.
        """
        with io.FileIO(file_name, "w") as out:
            out.write("\n".join(write_arr).encode('utf-8'))

    def write_file(self, file_name, file_contents):
        """Write *file_contents* to *file_name*, encoded as UTF-8."""
        with io.FileIO(file_name, "w") as out:
            out.write(file_contents.encode('utf-8'))

    def get_pic_hash(self):
        """Return today's date as a ``YYYY/MM/DD/`` sub-directory path."""
        return time.strftime("%Y/%m/%d/")

    def builder(self):
        """Rewrite the parsed document and write it as the template file.

        Steps: replace every collected text node with its span markup, point
        every image at ``show_pic_dir`` and copy the image file under
        ``cp_pic_dir`` (both suffixed with the date path), add the two
        ``<script>`` tags to ``<head>`` and write the result to
        ``build_tpl_name``.

        Text nodes that could not be found again are collected in
        ``self.bug_msg``.
        """
        # Text nodes whose replacement failed.
        bug_msg = []
        for original_text, replacement in zip(self.get_text_arr,
                                              self.replace_list):
            try:
                self.soup.body.find(text=original_text).replaceWith(replacement)
            except AttributeError:
                # find() returned None: the node is no longer in the tree.
                bug_msg.append(original_text)
        self.bug_msg = bug_msg

        hash_dir = self.get_pic_hash()
        show_pic_dir = self.show_pic_dir + hash_dir
        cp_pic_dir = self.cp_pic_dir + hash_dir
        if not isdir(cp_pic_dir):
            makedirs(cp_pic_dir)

        for pic_tag in self.cur_pic_arr:
            old_pic_src = pic_tag['src']
            pic_tag['src'] = show_pic_dir + old_pic_src
            cp_src_file = self.cur_dir + old_pic_src
            cp_dis_file = cp_pic_dir + old_pic_src
            # The source may contain sub-directories ("img/a.png"); make sure
            # the matching destination directory exists before copying.
            dest_dir = dirname(cp_dis_file)
            if dest_dir and not isdir(dest_dir):
                makedirs(dest_dir)
            copyfile(cp_src_file, cp_dis_file)

        jquery_tag = Tag(self.soup, "script")
        jquery_tag['type'] = "text/javascript"
        jquery_tag['src'] = self.js_path + "jquery.js"
        init_tag = Tag(self.soup, "script")
        init_tag['type'] = "text/javascript"
        init_tag['src'] = self.js_path + "init.js"
        # Both are inserted at index 2, init.js first, so jquery.js ends up
        # in front of init.js.
        self.soup.head.insert(2, init_tag)
        self.soup.head.insert(2, jquery_tag)

        # NOTE(review): write_file calls .encode('utf-8') on the soup object
        # itself; confirm the installed BeautifulSoup version provides it.
        self.write_file(self.build_tpl_name, self.soup)

    def get_replace_html(self, rep_id, rep_data=""):
        """Return the ``<span>`` markup for replacement slot *rep_id*.

        Parameters:
            rep_id: number used in the span id (``rep_<rep_id>``).
            rep_data: span content; when empty, the template placeholder
                ``$rep_arr[<rep_id>]`` is used instead.
        """
        if rep_data:
            rep_str = rep_data
        else:
            rep_str = "$rep_arr[{0}]".format(rep_id)
        return ('<span sty="data" id="rep_' + str(rep_id) + '">'
                + rep_str + '</span>')

    def _make_splitter(self, set_lang):
        """Return a ``SentenceSpliter`` configured for *set_lang*."""
        splitter = SentenceSpliter()
        splitter.SetLang(set_lang)
        return splitter

    def _split(self, splitter, text):
        """Return the sentences of *text*, or ``[text]`` if Split gave None."""
        pieces = splitter.Split(text)
        if pieces is None:
            return [text]
        return pieces

    def get_replace_list(self, text_arr, set_lang):
        """Return one replacement markup string per entry of *text_arr*.

        Each sentence of a text is wrapped by ``get_replace_html``; the span
        ids are numbered consecutively across the whole list.
        """
        splitter = self._make_splitter(set_lang)
        replacements = []
        next_id = 0
        for text in text_arr:
            spans = []
            for sentence in self._split(splitter, text):
                spans.append(self.get_replace_html(next_id, sentence))
                next_id += 1
            replacements.append("".join(spans))
        return replacements

    def parse_text(self, text_arr, set_lang):
        """Return the flat list of sentences of all texts in *text_arr*."""
        splitter = self._make_splitter(set_lang)
        sentences = []
        for text in text_arr:
            sentences.extend(self._split(splitter, text))
        return sentences

    def get_soup(self, parse_file):
        """Read *parse_file* and return it parsed by ``BeautifulSoup``.

        Returns ``False`` (after printing an error) if the file cannot be
        read.
        """
        try:
            with io.FileIO(parse_file, "r") as source:
                doc = source.readall()
        except IOError:
            print('ERROR: %s file not found!' % parse_file)
            return False
        return BeautifulSoup(doc)
def main(argv=None):
    """Command-line entry point; returns the process exit code.

    Expects five arguments after the program name: input HTML file, output
    template file, image save path, image display path and js path.

    Returns 255 when arguments are missing, 1 when the input file does not
    exist and 0 after the template and data file have been written.
    """
    import sys

    if argv is None:
        argv = sys.argv
    # All five arguments are read below, so all five are required.
    if len(argv) < 6:
        print("USAGE: python %s <input-file> <output-file> <save-pic-path> "
              "<show-pic-path> <js-path>" % argv[0])
        return 255
    if not isfile(argv[1]):
        print("no such input file: %s" % argv[1])
        return 1

    paser_file, tpl_file, save_pic_path, show_pic_path, load_js_path = argv[1:6]
    # Parse the input file and prepare the replacement data.
    so = build_tpl(paser_file, tpl_file, save_pic_path, show_pic_path,
                   load_js_path)
    # Write the template file.
    so.builder()
    # Write the extracted sentences next to the template.
    so.save_data_file()
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())