Python example: a NetEase web crawler that extracts all the text information from the NetEase page
- 2020-06-23 01:08:05
- OfStack
This article presents a Python example that implements a NetEase web crawler capable of extracting all the text information on the NetEase page. It is shared here for your reference; the details are as follows:
#coding=utf-8
#---------------------------------------
# Program: netease crawler
# The author: ewang
# Date: 2016-7-6
# Language: Python 2.7
# Function: get the text information in netease page and save to TXT File.
#---------------------------------------
import string
import urllib2
import re
import os
class WangYi_Spider:
    """Crawl the NetEase home page and save its text fragments to a file.

    The spider downloads ``wangyiUrl``, decodes it as GBK, reads the page
    title, pulls the text found inside ``<em>``, ``<span>`` and ``<a>``
    tags, and writes those fragments (GBK-encoded, one per line) to
    ``<title>.txt`` in the current working directory.

    Written for Python 2.7 (it relies on ``urllib2`` and ``raw_input``).

    Attributes:
        wangyiUrl: URL of the page to crawl.
        pageinfor: list of GBK-encoded, newline-terminated text fragments
            collected so far. Fragments are appended, never cleared.
    """

    # Tag patterns whose captured text is collected, in output order.
    # ``[^a-zA-Z0-9_]`` is exactly what ``\W`` means in Python 2 when
    # re.UNICODE is not set; spelling it out keeps the match independent
    # of the regex engine's Unicode mode. Only runs that contain no ASCII
    # letter, digit or underscore are captured.
    _TEXT_PATTERNS = (
        r'<em.*?>([^a-zA-Z0-9_]+?)</em>',
        r'<span>([^a-zA-Z0-9_]+?)</span>',
        r'<a.*?>([^a-zA-Z0-9_]+?)</a>',
    )

    # Characters that must not reach the file system as part of a name.
    _UNSAFE_FILENAME_CHARS = r'[\\/:*?"<>|\r\n\t]'

    def __init__(self):
        """Set the target URL and start with an empty fragment list."""
        self.wangyiUrl = "http://www.163.com/"
        self.pageinfor = []
        print(u' Netease has started crawler, crawl ...')

    def wangyi(self):
        """Run one crawl: download the page, report its title, save the text."""
        page = self._fetch_page()
        title = self.find_title(page)
        print(u' Web site name :' + title)
        # Hand the already-downloaded page on so it is not fetched twice.
        self.save_infor(title, page)

    def _fetch_page(self):
        """Download ``wangyiUrl`` and return its content decoded from GBK.

        Bytes that are not valid GBK are dropped instead of aborting the
        whole crawl with a UnicodeDecodeError.
        """
        response = urllib2.urlopen(self.wangyiUrl)
        try:
            raw = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()
        return raw.decode('gbk', 'ignore')

    def _safe_filename(self, title):
        """Return ``title`` + ``.txt`` with path-unsafe characters replaced by ``_``.

        The title comes from the remote page, so it must not be able to
        introduce path separators or reserved characters.
        """
        return re.sub(self._UNSAFE_FILENAME_CHARS, u'_', title) + u'.txt'

    def find_title(self, page):
        """Return the text between ``<title>`` and ``</title>`` in ``page``.

        Args:
            page: decoded page source.

        Returns:
            The first capture group of the title match, or ``u' No title '``
            (after printing a report line) when no title element is found.
        """
        match = re.search(r'<title>(.*?)</title>', page, re.S)
        if match:
            return match.group(1)
        print(u' Crawler report: unable to load web page title ...')
        return u' No title '

    def save_infor(self, title, page=None):
        """Collect the page text and write it to a file named after ``title``.

        Args:
            title: page title; used (sanitised) as the output file name.
            page: optional, already decoded page source. When omitted the
                page is downloaded again.

        After writing, prints where the file was saved and waits for the
        user to press Enter.
        """
        self.get_infor(page)
        filename = self._safe_filename(title)
        # ``with`` closes the file even if writing fails.
        with open(filename, 'w') as f:
            f.writelines(self.pageinfor)
        # os.getcwd() is a byte string on Python 2; concatenating it with
        # unicode fails for non-ASCII directories, so prefer getcwdu there.
        cwd = getattr(os, 'getcwdu', os.getcwd)()
        print(u' Crawler report: file ' + filename + u' Have downloaded :' + cwd)
        print(u' Press any key to exit ...')
        raw_input()

    def get_infor(self, page=None):
        """Extract the text fragments of the page into ``pageinfor``.

        Args:
            page: optional, already decoded page source. When omitted the
                page is downloaded from ``wangyiUrl``.
        """
        if page is None:
            page = self._fetch_page()
        self.deal_infor(page)

    def deal_infor(self, page):
        """Append the text inside ``<em>``, ``<span>`` and ``<a>`` tags to ``pageinfor``.

        Fragments are appended grouped by tag kind (all ``em`` matches, then
        ``span``, then ``a``), each terminated by a newline and encoded as GBK.

        Args:
            page: decoded page source.
        """
        for pattern in self._TEXT_PATTERNS:
            for text in re.findall(pattern, page, re.S):
                # Add the newline before encoding so text and bytes never mix.
                self.pageinfor.append((text + u'\n').encode('gbk'))
#------------ Program entry ----------------
# Show the program banner, then build the spider and run a single crawl.
print(u"""#---------------------------------------
# Program: netease crawler
# The author: ewang
# Date: 2016-7-6
# Language: Python 2.7
# Function: get the text information in netease page and save to TXT In the file
#--------------------------------------------------
""")
wangyiSpider = WangYi_Spider()
wangyiSpider.wangyi()
For more Python-related content, see this site's topic collections: "Python Socket Programming Skills Summary", "Python Regular Expression Usage Summary", "Python Data Structure and Algorithm Tutorial", "Python Function Usage Techniques", "Python String Skills Summary", "Python Introduction and Advanced Tutorial" and "Python File and Directory Skills Summary".
I hope this article is helpful to you in your Python programming.