This article takes you to understand the four common basic crawler methods of Python
- 2021-08-12 03:06:22
- OfStack
1. Urllib method
urllib is Python's built-in HTTP request library.
import urllib.request

# 1. Define the target URL to fetch
target_url = 'http://www.baidu.com/'
# 2. Send a GET request to the target URL
resp = urllib.request.urlopen(target_url)
# 3. Read the raw response body (a byte string)
raw = resp.read()
# print(raw)  # the undecoded payload is bytes (ASCII-escaped when printed)
# decode() converts the bytes in the given encoding into a str
print(raw.decode('utf-8'))
# POST request with urllib
import urllib.parse

post_url = 'http://www.iqianyue.com/mypost/'
# Build the form data to upload: urlencode the fields,
# then encode the resulting string into a byte stream
form_bytes = urllib.parse.urlencode({
    'name': 'Jack',
    'pass': '123456'
}).encode('utf-8')
# Passing data= makes urlopen issue a POST instead of a GET
page = urllib.request.urlopen(post_url, data=form_bytes).read()
print(page)
# Work around anti-crawler checks on request headers
import urllib.request

ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
# The Request class builds a complete request object carrying custom headers
req = urllib.request.Request('https://www.dianping.com/', headers=ua_headers)
body = urllib.request.urlopen(req).read()
print(body.decode('utf-8'))
# Timeout setting + exception handling
import urllib.request
import urllib.error

for i in range(20):
    try:
        # A deliberately tiny timeout (0.01s) forces most requests to fail,
        # demonstrating the exception paths below
        response1 = urllib.request.urlopen('http://www.ibeifeng.com/', timeout=0.01)
        print('a')  # printed only when the request succeeds in time
    except urllib.error.URLError as e:
        # Timeouts and connection failures surface as URLError
        print(e)
    except Exception as a:
        # BUG FIX: the original caught BaseException, which also swallows
        # KeyboardInterrupt and SystemExit; Exception is the proper catch-all.
        print(a)
2. requests method
Requests is an HTTP library written in Python on top of urllib and released under the Apache2 License.
urllib is still fairly inconvenient to use; Requests is more convenient and saves us a lot of work.
requests is the simplest, easiest-to-use HTTP library implemented in Python, and it is the recommended library for crawlers.
requests is not installed with Python by default, so it must be installed separately via pip.
import requests

# GET request
resp = requests.get('https://www.taobao.com/')
# Print the raw byte payload:
# print(resp.content)
# print(resp.content.decode('utf-8'))  # decode the bytes manually
print(resp.text)  # decoded text body
import chardet
# Detect the page encoding automatically; returns a dict
print(chardet.detect(resp.content))
A POST request that implements a mock form login
import requests

# Form fields to upload to the page
data = {
    'name': 'Jack',
    'pass': '123456'
}
# Send the request carrying the login data
r = requests.post('http://www.iqianyue.com/mypost/', data=data)
print(r.text)  # print the response text
# Store the logged-in HTML locally.
# BUG FIX: use a context manager so the file is closed even if write() raises
# (the original relied on a bare open()/close() pair).
with open('login.html', 'wb') as f:
    f.write(r.content)  # write the raw byte stream
# Work around anti-crawler checks on request headers with requests
import requests

# Build a browser-like header set
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
resp = requests.get('https://www.dianping.com/', headers=browser_headers)
print(resp.text)
print(resp.status_code)  # 403 means the request was intercepted
# cookies
# Reuse a saved cookie to skip logging in and fetch a protected resource
import requests

# Initialize cookies as an empty dict
cookies = {}
# BUG FIX: the original opened cookie.txt and never closed it;
# a with-statement guarantees the handle is released.
with open('cookie.txt', 'r') as f:
    # The file holds "name=value" pairs separated by ';' — split and iterate.
    # strip() removes surrounding whitespace from each pair.
    for line in f.read().split(';'):
        # split with maxsplit=1 cuts on the FIRST '=' only,
        # so values containing '=' survive intact
        name, value = line.strip().split('=', 1)
        # Add the pair to the cookies dict
        cookies[name] = value
r = requests.get('http://www.baidu.com', cookies=cookies)
data = r.text
# BUG FIX: write the page through a context manager as well
with open('baidu.html', 'w', encoding='utf-8') as f1:
    f1.write(data)
# Set up a proxy (search the web for free proxy IPs)
# Works around sites that block your IP
import requests

proxies = {
    # 'scheme': 'ip:port'
    # BUG FIX: requests looks proxies up by the LOWERCASE URL scheme, so the
    # key must be 'http' (not 'HTTP'), and the address must contain no spaces.
    'http': 'http://222.83.160.37:61205'
}
req = requests.get('http://www.taobao.com/', proxies=proxies)
print(req.text)
# Set a request timeout
import requests
from requests.exceptions import Timeout

try:
    # BUG FIX: removed the trailing space inside the URL string,
    # which made the request fail for the wrong reason.
    # The tiny timeout (0.01s) is intentional: it demonstrates the
    # Timeout exception path.
    response = requests.get("http://www.ibeifeng.com", timeout=0.01)
    print(response.status_code)
except Timeout:
    print(' Access timeout! ')
3. Analysis of BS4-BeautifulSoup4
from bs4 import BeautifulSoup
# Sample HTML document used by every BeautifulSoup example below.
# NOTE(review): the duplicated rel="external nofollow" attributes look like
# an extraction artifact of the original article; the parser keeps the first.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" rel="external nofollow" rel="external nofollow" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" rel="external nofollow" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" rel="external nofollow" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Create a BeautifulSoup object from the HTML string
soup=BeautifulSoup(html,'html.parser') # 'html.parser' is the stdlib default parser
print(type(soup))
# Pretty-print the parsed tree with indentation
print(soup.prettify())
#1 Access a tag by name (returns only the FIRST matching tag)
print(soup.p) # the first <p> tag
print(soup.a) # the first <a> tag
print(soup.title) # the <title> tag
#2 Get a tag's text content
print(soup.title.string)
print(soup.a.string)
print(soup.body.string) # returns None when the tag has multiple child tags
print(soup.head.string) # with exactly one child tag, returns that child's text
#3 Get attributes
print(soup.a.attrs) # returns all attributes as a dict
print(soup.a['id']) # get one specific attribute's value
#4 Child nodes
print(soup.p.contents) # list of all direct child nodes of the tag
print(soup.p.children) # iterator over the tag's direct child nodes
#5 Parent nodes
print(soup.p.parent) # the parent of <p>, with everything inside it
print(soup.p.parents) # iterator over all ancestors of <p>
#6 Sibling nodes (nodes at the same level)
# next_sibling / previous_sibling give the next / previous sibling element
print(soup.a.next_sibling)
print(soup.a.previous_sibling)
#2. Searching the document tree
#1 By tag name
# Find all <a> tags
res1=soup.find_all('a')
print(res1)
# Find all <a> tags whose class attribute is "sister"
# ('class' is a Python keyword and would be a syntax error as a parameter
# name, so bs4 uses 'class_' instead)
print(soup.find_all('a',class_="sister"))
#2 By regular expression
import re
# Find all tags whose name contains the character 'd'
res2=soup.find_all(re.compile('d+'))
print(res2)
#3 By list
# Find all <title> and <a> tags
res3=soup.find_all(['title','a'])
print(res3)
#4 By keyword argument
# Find tags with the attribute id='link1'
res4=soup.find_all(id='link1')
print(res4)
#5 By text content
res5=soup.find_all(text='Tillie') # exact text match
res55=soup.find_all(text=re.compile('Dormouse')) # regex text match
print(res55)
#6 Nested selection
print(soup.find_all('p'))
# Look inside every <p> tag for all of its <a> tags
for i in soup.find_all('p'):
    print(i.find_all('a'))
# 3. CSS selectors
# 1. Select by tag name
res6 = soup.select('a')  # returns a list of all <a> tags
print(res6)
# 2. Select by id attribute ('#' prefix)
print(soup.select('#link2'))
# 3. Select by class attribute ('.' prefix)
print(soup.select('.sister'))
print(soup.select('.sister')[2].get_text())  # extract the text content
# 4. Attribute selector (match <a> tags with the given href value)
# BUG FIX: the original packed several attributes (with garbled duplicated
# rel="external nofollow" text) into a single bracket, which is invalid CSS
# and is rejected by the selector engine; each attribute needs its own [..].
print(soup.select('a[href="http://example.com/elsie"]'))
# 5. Descendant selector (an <a id="link1"> inside a <p>)
print(soup.select('p a#link1'))
# 6. Selector grouping (comma = union of both selectors)
print(soup.select('a#link1,a#link2'))
# 7. Get the text of every matched tag
res7 = soup.select('p a.sister')
for i in res7:
    print(i.get_text())
# POST request (repeat of the earlier urllib example)
import urllib.parse

login_url = 'http://www.iqianyue.com/mypost/'
# urlencode the form fields, then turn the string into a byte stream
payload = urllib.parse.urlencode({
    'name': 'Jack',
    'pass': '123456'
}).encode('utf-8')
# Supplying data= makes urlopen send a POST request
result = urllib.request.urlopen(login_url, data=payload).read()
print(result)
0
4. XPath syntax
XPath is a language for finding information in XML documents.
XPath can be used to traverse elements and attributes in XML documents
# POST request (repeat of the earlier urllib example)
import urllib.parse

endpoint = 'http://www.iqianyue.com/mypost/'
# Build the uploaded data: encode the fields, then convert to bytes
body = urllib.parse.urlencode({
    'name': 'Jack',
    'pass': '123456'
}).encode('utf-8')
# data= switches the request method to POST
reply = urllib.request.urlopen(endpoint, data=body).read()
print(reply)
1