Python multithreaded crawler in practice: crawling jokes from Qiushibaike (the "Embarrassing Things Encyclopedia")
- 2020-06-19 10:34:51
- OfStack
A multithreaded crawler runs several segments of the program in parallel;
configuring the threads sensibly makes the crawler noticeably more efficient.
Below we compare an ordinary (single-threaded) crawler with a multithreaded one.
Analyzing this website's link structure shows the page URLs follow the pattern:
https://www.qiushibaike.com/8hr/page/<page>/
A multithreaded crawler in Python is similar to Java's multithreading; the code follows.
'''
# An ordinary, single-threaded crawler shown for comparison.
import urllib.request
import urllib.error
import re

# Spoof a desktop Chrome User-Agent and install it globally so every
# urlopen() call below carries the header.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

# Fetch the pages one after another — each request blocks the next.
for page in range(1, 2):
    url = "https://www.qiushibaike.com/8hr/page/" + str(page) + "/"
    html = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    # Non-greedy capture of each joke's text inside its content <div>.
    pattern = '<div class="content">.*?<span>(.*?)</span>(.*?)</div>'
    for idx, item in enumerate(re.compile(pattern, re.S).findall(html)):
        print(" The first " + str(page) + " The first page " + str(idx) + " The message was: ")
        print(item)
'''
'''
# Minimal demonstration of the threading module.
import threading  # standard-library threading support


class A(threading.Thread):
    """Demo thread: prints its identity eleven times when run."""

    def __init__(self):
        # One of the two required methods: initialise the Thread machinery.
        threading.Thread.__init__(self)

    def run(self):
        # One of the two required methods: the code executed by start().
        for _ in range(11):
            print(" I am a thread A")


class B(threading.Thread):
    """Demo thread: prints its identity eleven times when run."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        for _ in range(11):
            print(" I am a thread B")


t1 = A()    # instantiate the first thread
t1.start()  # launch it; run() executes concurrently
t2 = B()
t2.start()
'''
# The improved, multithreaded crawler: two threads split the work,
# one taking the odd-numbered pages and the other the even-numbered ones.
import urllib.request
import urllib.error
import re
import threading

# Spoof a desktop Chrome User-Agent and install it globally so that
# every urlopen() call made by the worker threads carries the header.
ua_string = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
headers = ("User-Agent", ua_string)
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
class one(threading.Thread):
    """Worker thread that crawls the odd-numbered pages (1, 3, ..., 11)."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # Visit every odd page; the sibling thread handles the even ones.
        for page in range(1, 12, 2):
            url = "https://www.qiushibaike.com/8hr/page/" + str(page) + "/"
            html = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
            # Non-greedy capture of each joke's text inside its content <div>.
            pattern = '<div class="content">.*?<span>(.*?)</span>(.*?)</div>'
            for idx, item in enumerate(re.compile(pattern, re.S).findall(html)):
                print(" The first " + str(page) + " The first page " + str(idx) + " The content is as follows: ")
                print(item)
class two(threading.Thread):
    """Worker thread that crawls the even-numbered pages (2, 4, ..., 10).

    NOTE: the original comment said "Crawl odd pages", but
    range(2, 12, 2) clearly visits the even-numbered pages; the
    odd pages are handled by the sibling class `one`.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        for page in range(2, 12, 2):  # even pages: 2, 4, 6, 8, 10
            url = "https://www.qiushibaike.com/8hr/page/" + str(page) + "/"
            html = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
            # Non-greedy capture of each joke's text inside its content <div>.
            pattern = '<div class="content">.*?<span>(.*?)</span>(.*?)</div>'
            datalist = re.compile(pattern, re.S).findall(html)
            for idx in range(len(datalist)):
                print(" The first " + str(page) + " The first page " + str(idx) + " The content is as follows: ")
                print(datalist[idx])
# Launch both worker threads: odd and even pages are fetched concurrently,
# which is where the speed-up over the single-threaded version comes from.
t1 = one()
t2 = two()
t1.start()
t2.start()