Implementing a crawler with the requests and lxml modules
- 2020-06-03 07:04:48
- OfStack
As shown below:
# The requests module requests a page.
# The lxml html module builds a selector from the response (parses the response).
# from lxml import html
# import requests
# response = requests.get(url).content
# selector = html.fromstring(response)
# hrefs = selector.xpath("/html/body//div[@class='feed-item _j_feed_item']/a/@href")
# We take url = 'https://www.mafengwo.cn/gonglve/ziyouxing/2033.html' as the example.
# python 2.7
import requests
from lxml import html
import os
# Get the neutron page on the first page url link
def get_page_urls(url):
    """Fetch *url* and return the list of article links found on it.

    Links come from the href attribute of each feed-item anchor.
    """
    page = requests.get(url).content
    # Build an lxml selector (element tree) from the raw HTML bytes.
    doc = html.fromstring(page)
    # xpath() already yields a list of href strings; copy so the caller owns it.
    return list(doc.xpath("/html/body//div[@class='feed-item _j_feed_item']/a/@href"))
# Get the title from a child page's html (div[@class='title'])
def get_page_a_title(url):
    """Request *url* (one of ziyouxing's a@href links) and return its title.

    Returns the list of text nodes under the page's title div.
    """
    doc = html.fromstring(requests.get(url).content)
    # xpath obtained via Chrome's dev tools: /html/body//div[@class='title']/text()
    return doc.xpath("/html/body//div[@class='title']/text()")
# Gets the page selector ( through lxml the html build )
def get_selector(url):
    """Download *url* and return an lxml selector built from its HTML."""
    return html.fromstring(requests.get(url).content)
# Analysing the page structure with Chrome's developer tools shows that the text content we need lives in div[@class='l-topic'] and div[@class='p-section'].
# Gets the required text content
def get_page_content(selector):
    """Extract the main text of an article page.

    Returns a ``(title_texts, section_texts)`` tuple; each element is the
    list of text nodes matched by the corresponding xpath.
    """
    # The title sits in div.l-topic, the body text in the div.p-section blocks.
    title_texts = selector.xpath("//div[@class='l-topic']/p/text()")
    section_texts = selector.xpath("//div[@class='p-section']/text()")
    return title_texts, section_texts
# Get the image on the page url address
def get_image_urls(selector):
    """Return the src attribute of every lazy-loaded image on the page."""
    return selector.xpath("//img[@class='_j_lazyload']/@src")
# Gets the title of the image
def get_image_title(selector, num):
    """Return the caption of the *num*-th image block on the page.

    ``num`` starts at 2 on these pages.  Returns the list of caption text
    nodes when one exists, otherwise a generated fallback name.

    Fixes vs. the original:
    - added the missing ':' after the def line (SyntaxError);
    - wrapped ``num`` in str(): an int cannot be concatenated to a str;
    - ``xpath()`` returns a (possibly empty) list, never None, so the old
      ``is not None`` check always succeeded; test for emptiness instead.
    """
    xpath_expr = ("/html/body/div[2]/div[2]/div[1]/div[2]/div["
                  + str(num) + "]/span[@class='img-an']/text()")
    caption = selector.xpath(xpath_expr)
    if caption:
        return caption
    # No caption on this block: synthesize a name so the file can be saved.
    return "map" + str(num)
# Download the pictures
def downloadimages(selector, number):
    """Download every image found via *selector* into the result folder
    for article *number*.

    Fixes vs. the original:
    - ``get_image_urls()`` was called without its required selector argument;
    - the filename expression was a SyntaxError and concatenated an int;
    - ``os.makedirs`` was called on the image *file* path, which created a
      directory of that name and made the subsequent open() fail; create
      the parent directory once instead;
    - the final count message printed the raw loop counter (which starts
      at 2) rather than the number of images downloaaded.
    """
    urls = get_image_urls(selector)
    # num indexes the caption blocks; on these pages it starts at 2.
    num = 2
    directory = "/home/WorkSpace/tour/words/result" + str(number)
    if not os.path.exists(directory):
        os.makedirs(directory)
    for url in urls:
        image_title = get_image_title(selector, num)
        # image_title may be a list of text nodes or a fallback string.
        filename = os.path.join(directory, str(image_title) + ".jpg")
        print('downloading %s image %s' % (number, image_title))
        with open(filename, 'wb') as f:
            f.write(requests.get(url).content)
        num += 1
    print("Already downloaded %s pictures" % len(urls))
# Entry, start and store the acquired data in a file
# Entry point: crawl each article page, download its images and store
# its text in a per-article file.
#
# Fixes vs. the original:
# - ``num`` was undefined (the counter is ``number``);
# - ``filename`` was tested with os.path.exists before it was assigned;
#   the existence check belongs to the directory ``path``;
# - page_title/page_content are lists of text nodes, so they must be
#   joined before concatenation;
# - the filename used the literal string "num" instead of the counter;
# - ``number`` was never incremented, so every article overwrote result1;
# - text was written to a file opened in binary mode.
if __name__ == '__main__':
    url = 'https://www.mafengwo.cn/gonglve/ziyouxing/2033.html'
    urls = get_page_urls(url)
    number = 1
    for article_url in urls:
        selector = get_selector(article_url)
        # Download the article's images.
        downloadimages(selector, number)
        # Extract the text and write it into a per-article file.
        page_title, page_content = get_page_content(selector)
        result = '\n'.join(page_title) + '\n' + '\n'.join(page_content) + '\n\n'
        path = "/home/WorkSpace/tour/words/result" + str(number) + "/"
        if not os.path.exists(path):
            os.makedirs(path)
        filename = path + str(number) + ".txt"
        with open(filename, 'w') as f:
            f.write(result)
        print(result)
        number += 1
This is the end of the crawler. Before crawling a page, you must analyse its HTML structure carefully; some pages are generated by JavaScript. This page is relatively simple and does not involve handling JS — a future post will cover that topic.