This article walks you through four common basic Python crawler methods


1. Urllib method

urllib is Python's built-in HTTP request library.


import urllib.request
# 1. Specify the target url
url='http://www.baidu.com/'
# 2. Send a request to the target url
response=urllib.request.urlopen(url)
# 3. Read the response data
data=response.read()
# print(data)  # the raw data is a byte string
print(data.decode('utf-8'))  # decode() turns the bytes into a string using the given encoding

# POST request
import urllib.parse
url='http://www.iqianyue.com/mypost/'
# Build the data to upload
postdata=urllib.parse.urlencode({
 'name':'Jack',
 'pass':'123456'
}).encode('utf-8')  # turn the string into bytes
html=urllib.request.urlopen(url,data=postdata).read()
print(html)

# headers: getting past anti-crawling checks on the request header
import urllib.request
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
request1=urllib.request.Request('https://www.dianping.com/',headers=headers)  # the Request class builds a complete request object
response1=urllib.request.urlopen(request1).read()
print(response1.decode('utf-8'))


# Timeout setting + exception handling
import urllib.request
import urllib.error
for i in range(20):
 try:
  response1=urllib.request.urlopen('http://www.ibeifeng.com/',timeout=0.01)
  print('a')  # printed only when the request succeeds within the timeout
 except urllib.error.URLError as e:
  print(e)
 except BaseException as a:  # base class of all exceptions
  print(a)

2. requests method

Requests is an HTTP library written in Python on top of urllib and released under the Apache2 License.
urllib is still fairly inconvenient to use; Requests is more convenient and saves a lot of work.
requests is the simplest and easiest-to-use HTTP library implemented in Python, and it is the recommended library for crawlers.
requests is not installed with Python by default, so it needs to be installed separately through pip.
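For example, from a command line:

pip install requests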


import requests
# GET request
r=requests.get('https://www.taobao.com/')
# Print the raw byte data
# print(r.content)
# print(r.content.decode('utf-8'))  # decode the bytes
print(r.text)  # print the text data

import chardet
# Automatically detect the page encoding; returns a dictionary
print(chardet.detect(r.content))
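
If the encoding chardet reports differs from the one requests guessed, it can be fed back to the response so that r.text decodes correctly; a small sketch (r.encoding is the attribute requests uses when building r.text):

import requests
import chardet

r=requests.get('https://www.taobao.com/')
# tell requests to decode the body with the encoding chardet detected
r.encoding=chardet.detect(r.content)['encoding']
print(r.text[:200])  # r.text is now decoded with that encoding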


# POST request: simulating a form login
import requests
# Build the form data to upload
data={
 'name':'Jack',
 'pass':'123456'
}
# Send the request together with the login data
r=requests.post('http://www.iqianyue.com/mypost/',data=data)
print(r.text)  # print the response text
# Save the returned html locally
f=open('login.html','wb')
f.write(r.content)  # write the byte data
f.close()


# headers: getting past anti-crawling checks on the request header
import requests
# Build the headers
headers={
 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
r=requests.get('https://www.dianping.com/',headers=headers)
print(r.text)
print(r.status_code)  # status 403 means the request was blocked (check the status code)

# cookies
# Skip the login step and fetch resources directly
import requests
f=open('cookie.txt','r')  # open the cookie file
# Initialize cookies: declare an empty dictionary
cookies={}
# Split the file content on ';' to get a list, then iterate over it
# split(): split a string; strip(): remove leading/trailing whitespace
for line in f.read().split(';'):
 # with maxsplit=1, split cuts the string into two parts
 name,value=line.strip().split('=',1)
 # add the entry to the cookies dictionary
 cookies[name]=value
r=requests.get('http://www.baidu.com',cookies=cookies)
data=r.text
f1=open('baidu.html','w',encoding='utf-8')
f1.write(data)
f1.close()
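
As a side note, requests can also carry cookies across requests automatically through a Session object, instead of loading them from a file; a minimal sketch, assuming the same login form as above:

import requests

s=requests.Session()  # a Session keeps cookies between requests
# log in once; any cookies the server sets are stored on the session
s.post('http://www.iqianyue.com/mypost/',data={'name':'Jack','pass':'123456'})
# later requests from the same session send those cookies automatically
r=s.get('http://www.iqianyue.com/mypost/')
print(r.status_code)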

# Set up a proxy (search online for free proxy IPs)
# Works around websites that block your IP
import requests
proxies={
 # 'protocol':'ip:port'
 'http':'222.83.160.37:61205'
}
req=requests.get('http://www.taobao.com/',proxies=proxies)
print(req.text)

# Set a timeout
import requests
from requests.exceptions import Timeout
try:
 response = requests.get("http://www.ibeifeng.com", timeout=0.01)
 print(response.status_code)
except Timeout:
 print('Request timed out!')

3. Parsing with BeautifulSoup4 (bs4)


from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# Create a BS object
soup=BeautifulSoup(html,'html.parser')  # html.parser is the default parser
print(type(soup))
# Pretty-print the document structure
print(soup.prettify())
# 1. Traversing the document tree
# (1) Get a tag (only the first matching tag is returned)
print(soup.p)  # get the p tag
print(soup.a)  # get the a tag
print(soup.title)  # get the title tag
# (2) Get the tag's text content
print(soup.title.string)
print(soup.a.string)
print(soup.body.string)  # returns None when the tag contains several child tags
print(soup.head.string)  # returns the child's text when the tag contains a single child tag
# (3) Get attributes
print(soup.a.attrs)  # returns a dictionary
print(soup.a['id'])  # get the value of a specific attribute
# (4) Work with child nodes
print(soup.p.contents)  # all child nodes of the tag
print(soup.p.children)  # an iterator over all child nodes of the tag
# (5) Work with the parent node
print(soup.p.parent)  # the parent node of the p tag with everything inside it
print(soup.p.parents)  # an iterator over the ancestors of the p tag
# (6) Work with sibling nodes (nodes at the same level)
# next_sibling and previous_sibling return the next and previous sibling elements
print(soup.a.next_sibling)
print(soup.a.previous_sibling)

# 2. Searching the document tree
# (1) By tag name
# find all a tags
res1=soup.find_all('a')
print(res1)
# get all a tags whose class attribute is "sister"
# (using class as a keyword argument is a syntax error, so class_ is used instead)
print(soup.find_all('a',class_="sister"))
# (2) By regular expression
import re
# find all tags whose name contains the character d
res2=soup.find_all(re.compile('d+'))
print(res2)
# (3) By list
# find all title tags and a tags
res3=soup.find_all(['title','a'])
print(res3)
# (4) By keyword argument
# find the tags whose attribute id='link1'
res4=soup.find_all(id='link1')
print(res4)
# (5) By text content
res5=soup.find_all(text='Tillie')  # exact text match
res55=soup.find_all(text=re.compile('Dormouse'))
print(res55)
# (6) Nested selection
print(soup.find_all('p'))
# find all a tags inside every p tag
for i in soup.find_all('p'):
 print(i.find_all('a'))

# 3. CSS selectors
# (1) Select by tag name
res6=soup.select('a')  # returns a list
print(res6)  # all a tags
# (2) Select by id attribute (use # for an id)
print(soup.select('#link2'))
# (3) Select by class attribute (use . for a class)
print(soup.select('.sister'))
print(soup.select('.sister')[2].get_text())  # get the text content
# (4) Select by attribute (a tags whose href has the given value)
print(soup.select('a[href="http://example.com/elsie"]'))
# (5) Descendant selection (a#link1 inside a p tag)
print(soup.select('p a#link1'))
# (6) Select several selectors at once
print(soup.select('a#link1,a#link2'))
# (7) Get the tag text
res7=soup.select('p a.sister')
for i in res7:
 print(i.get_text())


4. XPath syntax

XPath is a language for finding information in XML documents.
It can be used to traverse elements and attributes in XML (and HTML) documents, as sketched below.
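
A minimal sketch of a few XPath queries using the lxml package (an assumption; the original article does not show which library it used), run against a simplified version of the "Dormouse's story" HTML from the bs4 section:

from lxml import etree

html_doc = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
</body></html>
'''

tree=etree.HTML(html_doc)  # build an element tree from the HTML string
print(tree.xpath('//title/text()'))           # text of the title tag
print(tree.xpath('//a/@href'))                # href attribute of every a tag
print(tree.xpath('//a[@id="link1"]/text()'))  # text of the a tag whose id is link1
print(tree.xpath('//p[@class="story"]/a'))    # a tags directly under p tags with class="story"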


