Python: a simple crawler example using the BeautifulSoup library

  • 2020-11-30 08:26:00
  • OfStack

A brief introduction to the functions that will be used

1, from bs4 import BeautifulSoup

[

# import libraries

]

2. Request headers


headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36','referer':"www.mmjpg.com" }
all_url = 'http://www.mmjpg.com/' 
'User-Agent': identifies the browser/client making the request
'referer': the page the request claims to have come from

3. Make connections


start_html = requests.get(all_url, headers=headers)
all_url : the starting address, i.e. the first page to visit
headers : the request headers, which tell the server who is visiting
requests.get : fetches the page at all_url and returns the response

4. Parse the obtained page


Soup = BeautifulSoup(start_html.text, 'lxml')
BeautifulSoup : parses the page
lxml : the parser to use
start_html.text : the text content of the fetched page

5. Process the retrieved page


all_a = Soup.find('div', class_='pic').find_all('a')[-2]
Soup.find() : finds the first matching element
find_all() : finds all matching elements and returns a list
.find('img')['src'] : gets the src attribute of an img tag
class_ : matches on the target element's class name
div / a : restricts the match to div / a tags
[-2] : indexes from the end of the list, here selecting the second-to-last <a> tag

6. Get the target content


<a href =# > content </a>
a[i].get_text() : returns the text content of the i-th <a> tag

7. Other functions that may be used:

1. Create and switch folders


os.makedirs(os.path.join("E:\name", filename))
# creates a folder named filename under the directory E:\name
os.chdir("E:\name\\" + filename)
# switches the working directory to E:\name\filename

2. File saving


f = open(name+'.jpg', 'ab')  ## the 'b' (binary) flag is required when writing media files!
f.write(img.content)  ## use the response's binary .content for media files!
f.close()

A complete example: crawling an image gallery site


  
import requests
from bs4 import BeautifulSoup
import os
# Import the required modules 
class mzitu():
  """Simple crawler for www.mzitu.com.

  Walks the archive page, then each gallery's numbered pages, and saves
  every image into a per-gallery folder under E:\\mzitu2.
  """

  def all_url(self, url):
    """Fetch the archive page at *url* and crawl every gallery linked from it."""
    html = self.request(url)
    all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a')
    for a in all_a:
      title = a.get_text()
      print('------ Start saving: ', title)
      # '?' is illegal in Windows folder names, so replace it.
      path = str(title).replace("?", '_')
      self.mkdir(path)  # create (and switch into) the folder named after the gallery
      href = a['href']
      self.html(href)

  def html(self, href):
    """Crawl one gallery: find how many pages it has, then fetch each page."""
    html = self.request(href)
    # The second-to-last <span> in the pager holds the highest page number.
    max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
    for page in range(1, int(max_span) + 1):
      page_url = href + '/' + str(page)
      self.img(page_url)

  def img(self, page_url):
    """Extract the real image URL from a gallery page and save the image."""
    img_html = self.request(page_url)
    img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
    self.save(img_url)

  def save(self, img_url):
    """Download *img_url* and write it to <name>.jpg in the current directory."""
    name = img_url[-9:-4]  # file stem taken from the tail of the URL, e.g. '.../xxxxx.jpg'
    img = self.request(img_url)
    # Binary mode is required for image data; 'with' guarantees the file is closed.
    with open(name + '.jpg', 'ab') as f:
      f.write(img.content)

  def mkdir(self, path):
    """Create E:\\mzitu2\\<path> if needed and make it the working directory.

    Returns True if the folder was created, False if it already existed.
    """
    path = path.strip()
    target = os.path.join("E:\mzitu2", path)
    created = not os.path.exists(target)
    if created:
      # BUG FIX: the original string literal contained an unescaped
      # apostrophe ("It's"), which was a syntax error.
      print(" built 1 It's called ", path, ' Folder! ')
      os.makedirs(target)
    else:
      print( path, ' The folder already exists! ')
    # BUG FIX: the original only chdir'd when the folder was newly created,
    # so images for an already-existing gallery were saved into whatever
    # directory happened to be current (the previous gallery's folder).
    os.chdir(target)
    return created

  def request(self, url):
    """GET *url* with a browser-like User-Agent and a fake referer; return the response."""
    headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
      # Fake an access source so the site serves the image.
      # BUG FIX: in the original, this value was swallowed by an inline
      # comment placed before it, leaving the dict literal a syntax error.
      'referer': "http://www.mzitu.com/100260/2",
    }
    content = requests.get(url, headers=headers)
    return content
# Set the startup function 
def main():
  """Entry point: instantiate the crawler and start from the archive page."""
  crawler = mzitu()
  crawler.all_url('http://www.mzitu.com/all')

# BUG FIX: guard the entry point so importing this module does not
# immediately start crawling; behavior when run as a script is unchanged.
if __name__ == "__main__":
  main()


Related articles: