Python example: a NetEase web crawler that retrieves all the text information on the NetEase page

  • 2020-06-23 01:08:05
  • OfStack

This article presents a Python example of a NetEase web crawler that retrieves all the text information on the NetEase page. It is shared here for your reference; the details are as follows:


#coding=utf-8
#---------------------------------------
#   Program: netease crawler 
#   The author: ewang
#   Date: 2016-7-6
#   Language: Python 2.7
#   Function: get the text information in netease page and save to TXT File. 
#---------------------------------------
import string
import urllib2
import re
import os
class WangYi_Spider:
  """Crawler that saves text found on the NetEase page to a text file.

  The page at ``wangyiUrl`` is downloaded and decoded from GBK.  Text that
  sits directly inside ``<em>``, ``<span>`` and ``<a>`` elements is collected
  when it contains no ASCII letter, digit or underscore (this is what lets
  the Chinese text through), and the collected lines are written to
  ``<page title>.txt`` relative to the current working directory.
  """

  # One pattern per tag, applied in this order.  The capture group spells out
  # [^a-zA-Z0-9_] instead of \W so the accepted characters do not depend on
  # whether the regex engine treats \W as ASCII-only or Unicode-aware.
  _TAG_PATTERNS=(
    r"<em.*?>([^a-zA-Z0-9_]+?)</em>",
    r"<span>([^a-zA-Z0-9_]+?)</span>",
    r"<a.*?>([^a-zA-Z0-9_]+?)</a>",
  )

  def __init__(self):
    """Set the target URL and start with an empty list of collected text."""
    # Address of the page to crawl.
    self.wangyiUrl="http://www.163.com/"
    # GBK-encoded, newline-terminated pieces of text taken from the page.
    self.pageinfor=[]
    print(u' Netease has started crawler, crawl ...')

  def wangyi(self):
    """Download the page, report its title and save its text to a file."""
    # Download once and hand the decoded text on, so the title and the saved
    # text come from the same response instead of two separate requests.
    page=self._download()
    title=self.find_title(page)
    print(u' Web site name :'+title)
    self.save_infor(title,page)

  def _download(self):
    """Return the content of ``wangyiUrl`` decoded from GBK."""
    response=urllib2.urlopen(self.wangyiUrl)
    try:
      raw=response.read()
    finally:
      # Release the connection even when reading fails.
      response.close()
    return raw.decode('gbk')

  def find_title(self,page):
    """Return the text between <title> and </title> in ``page``.

    When the page has no title element a notice is printed and the
    placeholder u' No title ' is returned.
    """
    # re.S lets the title span several lines.
    match=re.search(r'<title>(.*?)</title>',page,re.S)
    if match:
      return match.group(1)
    print(u' Crawler report: unable to load web page title ...')
    return u' No title '

  def save_infor(self,title,page=None):
    """Collect the page text and write it to ``title + '.txt'``.

    title -- page title, used as the base name of the output file.
    page  -- optional, already decoded page source; when omitted the page
             is downloaded again.

    After writing, a report is printed and the method waits for the user
    to press Enter.
    """
    self.get_infor(page)
    # The with-block closes the file even if writing raises.
    with open(title+'.txt','w+') as f:
      f.writelines(self.pageinfor)
    print(u' Crawler report: file '+title+'.txt'+u' Have downloaded :'+os.getcwd())
    print(u' Press any key to exit ...')
    raw_input()

  def get_infor(self,page=None):
    """Extract the text of ``page`` into ``pageinfor``.

    page -- optional, already decoded page source; when omitted the page
            is downloaded and decoded from GBK first.
    """
    if page is None:
      page=self._download()
    self.deal_infor(page)

  def deal_infor(self,page):
    """Append the text of <em>, <span> and <a> elements to ``pageinfor``.

    page -- decoded page source.

    Each captured piece is encoded back to GBK and terminated with a
    newline.  All <em> matches come first, then <span>, then <a>.
    """
    for pattern in self._TAG_PATTERNS:
      for text in re.findall(pattern,page,re.S):
        # b'\n' keeps the concatenation a byte string, matching the
        # GBK-encoded text it is appended to.
        self.pageinfor.append(text.encode('gbk')+b'\n')
#------------ Program entry ----------------
# Print the program banner one line per entry; the trailing empty entry
# supplies the banner's final newline.  Then build the crawler and run it.
print(u'\n'.join((
  u'#---------------------------------------',
  u'#   Program: netease crawler ',
  u'#   The author: ewang',
  u'#   Date: 2016-7-6',
  u'#   Language: Python 2.7',
  u'#   Function: get the text information in netease page and save to TXT In the file ',
  u'#--------------------------------------------------',
  u'',
)))
wangyiSpider=WangYi_Spider()
wangyiSpider.wangyi()

For more Python-related content, see this site's topics: "Python Socket programming skills summary", "Python regular expression usage summary", "Python data structure and algorithm tutorial", "Python function using techniques", "Python string skills summary", "Python introduction and advanced tutorial" and "Python file and directory skills summary".

I hope this article is helpful to you in your Python programming.


Related articles: