Implementing a crawler with requests and lxml

  • 2020-06-03 07:04:48
  • OfStack

As shown below:

# the requests module requests a page
# lxml's html module builds a selector from the response (html.fromstring)
# from lxml import html
# import requests

# response = requests.get(url).content

# selector = html.fromstring(response)

# hrefs = selector.xpath("/html/body//div[@class='feed-item _j_feed_item']/a/@href")

# take url = 'https://www.mafengwo.cn/gonglve/ziyouxing/2033.html' as the example
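
Before the full script, here is a minimal, self-contained sketch of this fromstring/xpath flow, run against inline markup invented for illustration rather than a live request:

# python 2.7
from lxml import html

# invented markup mirroring the list-page structure described above
page = ("<html><body>"
        "<div class='feed-item _j_feed_item'><a href='/a/1.html'>t</a></div>"
        "</body></html>")
selector = html.fromstring(page)
print selector.xpath("//div[@class='feed-item _j_feed_item']/a/@href")
# prints: ['/a/1.html']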


# python 2.7
import requests
from lxml import html
import os

# Get the sub-page url links from the first page
def get_page_urls(url):
  response = requests.get(url).content
  # build the selector via lxml's html module
  selector = html.fromstring(response)
  urls = []
  for i in selector.xpath("/html/body//div[@class='feed-item _j_feed_item']/a/@href"):
    urls.append(i)
  return urls
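
Since xpath() already returns a plain list of the matched href strings, the loop above is not strictly needed; the helper could equivalently return the result directly:

def get_page_urls(url):
  response = requests.get(url).content
  selector = html.fromstring(response)
  # xpath() returns the matched @href values as a list of strings
  return selector.xpath("/html/body//div[@class='feed-item _j_feed_item']/a/@href")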

# get the title from a sub-page's html (div[@class='title'])
def get_page_a_title(url):
  '''url is ziyouxing's a@href'''
  response = requests.get(url).content
  selector = html.fromstring(response)
  # xpath obtained via Chrome's developer tools --> /html/body//div[@class='title']/text()
  a_title = selector.xpath("/html/body//div[@class='title']/text()")
  return a_title
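
Note that xpath() always returns a list, so callers typically want its first element. A small convenience helper (hypothetical, not part of the original code) guards against empty results:

# hypothetical helper: first xpath match, or a default when nothing matched
def first_or_default(nodes, default=''):
  return nodes[0] if nodes else default

# e.g.: title = first_or_default(get_page_a_title(url))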

# Get a selector for a page (built via lxml's html module)
def get_selector(url):
  response = requests.get(url).content
  selector = html.fromstring(response)
  return selector

# After analyzing the page structure with Chrome's developer tools, we find that the text content we need lives mainly in div[@class='l-topic'] (the title) and div[@class='p-section'] (the body paragraphs)

# Get the required text content
def get_page_content(selector):
  # /html/body/div[2]/div[2]/div[1]/div[@class='l-topic']/p/text()
  page_title = selector.xpath("//div[@class='l-topic']/p/text()")
  # /html/body/div[2]/div[2]/div[1]/div[2]/div[15]/div[@class='p-section']/text()
  page_content = selector.xpath("//div[@class='p-section']/text()")
  return page_title, page_content

# Get the image url addresses on the page
def get_image_urls(selector):
  imagesrcs = selector.xpath("//img[@class='_j_lazyload']/@src")
  return imagesrcs
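
The _j_lazyload class suggests these images are lazy-loaded, and on such pages the real URL often lives in a data-* attribute rather than src. A hedged variant that checks both (the data-src attribute name is an assumption, not verified against the site):

def get_image_urls_lazy(selector):
  # 'data-src' is a guess at the lazy-load attribute; fall back to src
  srcs = selector.xpath("//img[@class='_j_lazyload']/@data-src")
  return srcs or selector.xpath("//img[@class='_j_lazyload']/@src")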

# Get the title of an image

def get_image_title(selector, num):
  # num starts from 2
  query = "/html/body/div[2]/div[2]/div[1]/div[2]/div[" + str(num) + "]/span[@class='img-an']/text()"
  titles = selector.xpath(query)
  # xpath() returns a list; it is empty (never None) when nothing matches
  if titles:
    image_title = titles[0]
  else:
    image_title = "map" + str(num)  # fall back to a numbered name
  return image_title

# Download the images

def downloadimages(selector, number):
  '''number is used for counting'''
  urls = get_image_urls(selector)
  num = 2
  amount = len(urls)
  # create the result directory once, before writing any image into it
  path = "/home/WorkSpace/tour/words/result" + str(number) + "/"
  if not os.path.exists(path):
    os.makedirs(path)
  for url in urls:
    image_title = get_image_title(selector, num)
    filename = path + image_title + ".jpg"
    print('downloading %s image %s' % (number, image_title))
    with open(filename, 'wb') as f:
      f.write(requests.get(url).content)
    num += 1
  print "downloaded %s images in total" % amount

# Entry point: start crawling and store the acquired data in files
if __name__ == '__main__':
  url = 'https://www.mafengwo.cn/gonglve/ziyouxing/2033.html'
  urls = get_page_urls(url)
  number = 1
  for i in urls:
    selector = get_selector(i)
    # download images
    downloadimages(selector, number)
    # get text and write it into a file
    page_title, page_content = get_page_content(selector)
    # xpath() returns lists of text fragments; join them before writing
    result = '\n'.join(page_title) + '\n' + '\n'.join(page_content) + '\n\n'
    path = "/home/WorkSpace/tour/words/result" + str(number) + "/"
    if not os.path.exists(path):
      os.makedirs(path)
    filename = path + str(number) + ".txt"
    with open(filename, 'wb') as f:
      f.write(result.encode('utf-8'))
    print result
    number += 1
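
One design note: building paths by string concatenation is fragile (a missing slash silently changes the path); os.path.join assembles the same paths more safely. The directories here are the article's own example paths:

path = os.path.join("/home/WorkSpace/tour/words", "result" + str(number))
filename = os.path.join(path, str(number) + ".txt")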

This concludes the crawler. Before crawling any page, you must carefully analyze its html structure first; some pages are generated by js. This page is relatively simple and does not involve handling js, which will be covered in a future post.

