A simple Python web-crawler example using the BeautifulSoup library
- 2020-11-30 08:26:00
- OfStack
A brief introduction to the functions that will be used
1, from bs4 import BeautifulSoup
# import the library
2. Request headers
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36','referer':"www.mmjpg.com" }
all_url = 'http://www.mmjpg.com/'
'User-Agent': Request way
'referer': From which link it jumps in
3. Make connections
start_html = requests.get(all_url, headers=headers)
all_url : the starting address, i.e. the first page to visit
headers : the request headers, which tell the server who is making the request
requests.get : a requests-library method that fetches the content of the all_url page and returns it
4. Parse the obtained page
Soup = BeautifulSoup(start_html.text, 'lxml')
BeautifulSoup : Parse page
lxml : the parser
start_html.text : The content of the page
5. Process the retrieved page
all_a = Soup.find('div', class_='pic').find_all('a')[-2]
Soup.find() : finds the first matching element
find_all() : finds all matches and returns a list
.find('img')['src'] : gets the src attribute of an img tag
class_ : matches on the element's class name (the trailing underscore avoids clashing with the Python keyword)
div / a : the tag-name condition (a div or an a element)
[-2] : negative indexing drops trailing matches; here it selects the second-to-last a tag
6. Get the target content
<a href =# > content </a>
a[i].get_text() : gets the text content of the i-th a tag
7. Other functions that may be used:
1. Create and switch folders
os.makedirs(os.path.join("E:\name", filename))
# In the directory E:\name Let's create a name filename The folder
os.chdir("E:\name\\" + filename)
# Switch the work path to E:\name\filename Under the
2. File saving
f = open(name+'.jpg', 'ab')  ## the 'b' flag is required when writing binary (multimedia) files!
f.write(img.content)  ## use .content for binary (multimedia) responses!
f.close()
Case in point: Crawling the girl
import requests
from bs4 import BeautifulSoup
import os
# Import the required modules
class mzitu():
    """Simple image crawler for mzitu.com.

    Walks the full album index, visits every page of every album, and
    saves each image into a per-album folder under E:\\mzitu2.
    """

    def all_url(self, url):
        """Fetch the album index page at *url* and process every album link."""
        html = self.request(url)
        # every album link lives inside <div class="all">
        all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a')
        for a in all_a:
            title = a.get_text()
            print('------ Start saving: ', title)
            # '?' is illegal in Windows folder names, so replace it
            path = str(title).replace("?", '_')
            self.mkdir(path)  # create the album folder (also chdirs into it)
            href = a['href']
            self.html(href)

    def html(self, href):
        """Visit one album at *href* and crawl each of its image pages."""
        html = self.request(href)
        # the second-to-last <span> of the pager holds the last page number
        max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
        for page in range(1, int(max_span) + 1):
            page_url = href + '/' + str(page)
            self.img(page_url)

    def img(self, page_url):
        """Resolve *page_url* to the real image URL and save the image."""
        img_html = self.request(page_url)
        img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
        self.save(img_url)

    def save(self, img_url):
        """Download the image at *img_url* into the current working directory."""
        name = img_url[-9:-4]  # short file name taken from the tail of the URL
        img = self.request(img_url)
        # binary mode ('b') is mandatory for image data; fix: use a context
        # manager so the file handle is always closed
        with open(name + '.jpg', 'ab') as f:
            f.write(img.content)

    def mkdir(self, path):
        """Create folder *path* under E:\\mzitu2 and chdir into it.

        Returns True if a new folder was created, False if it already
        existed (in which case the working directory is left unchanged).
        """
        path = path.strip()
        isExists = os.path.exists(os.path.join("E:\mzitu2", path))
        if not isExists:
            # fix: the original print used an unbalanced single quote
            # (a SyntaxError); double quotes keep the message unchanged
            print(" built 1 It's called ", path, ' Folder! ')
            os.makedirs(os.path.join("E:\mzitu2", path))
            os.chdir(os.path.join("E:\mzitu2", path))  # enter the new folder
            return True
        else:
            print(path, ' The folder already exists! ')
            return False

    def request(self, url):
        """GET *url* with browser-like headers and return the Response."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
            # fix: the original left the referer value inside the comment,
            # leaving "'referer':" with no value (a SyntaxError); a fake
            # referer keeps the site from rejecting hotlink-style requests
            'referer': "http://www.mzitu.com/100260/2",
        }
        content = requests.get(url, headers=headers)
        return content
# Set the startup function
def main():
    """Entry point: crawl the complete album index of mzitu.com."""
    Mzitu = mzitu()  # instantiate the crawler
    Mzitu.all_url('http://www.mzitu.com/all')  # hand the index URL to all_url


# fix: guard the call so merely importing this module does not start a crawl
if __name__ == "__main__":
    main()