Python multithreaded crawler in practice: scraping jokes from Qiushibaike

  • 2020-06-19 10:34:51
  • OfStack

A multithreaded crawler runs several parts of the program concurrently,

so a sensible use of threads can make the crawler noticeably more efficient.

An ordinary crawler versus a multithreaded crawler

Analyzing the site's links shows that page URLs follow this pattern:

https://www.qiushibaike.com/8hr/page/&lt;page&gt;/

Multithreading in Python is similar to Java's; let's go straight to the code.


'''
# A plain single-threaded crawler: fetch each page and extract the jokes with a regex.
import urllib.request
import urllib.error
import re
# Pretend to be a desktop Chrome browser so the site serves normal pages.
headers = ("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
for i in range(1,2):
 url = "https://www.qiushibaike.com/8hr/page/"+str(i)+"/"
 pagedata = urllib.request.urlopen(url).read().decode("utf-8","ignore")
 # Each match yields two groups: the joke text span and the trailing div content.
 pattern = '<div class="content">.*?<span>(.*?)</span>(.*?)</div>'
 datalist = re.compile(pattern,re.S).findall(pagedata)
 for j in range(0,len(datalist)):
  print(" The first "+str(i)+" The first page "+str(j)+" The message was: ")
  print(datalist[j])
'''
'''
# Minimal threading demo: two Thread subclasses running side by side.
import threading # bring in the threading module
class A(threading.Thread): # define thread class A
 def __init__(self):  # required method 1: initialize the thread
  threading.Thread.__init__(self)
 def run(self):   # required method 2: the thread's work goes here
  for i in range(0,11):
   print(" I am a thread A")
class B(threading.Thread): # define thread class B
 def __init__(self):  # required method 1: initialize the thread
  threading.Thread.__init__(self)
 def run(self):   # required method 2: the thread's work goes here
  for i in range(0,11):
   print(" I am a thread B")
t1 = A() # instantiate thread A
t1.start() # start it running
t2 = B()
t2.start()
'''

# The multithreaded version of the crawler.
# Two threads split the work: one crawls the odd pages, the other the even pages.
import urllib.request
import urllib.error
import re
import threading
# Install a global opener that spoofs a desktop Chrome browser, so every
# urllib.request.urlopen() call below carries a realistic User-Agent.
ua_header = ("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
url_opener = urllib.request.build_opener()
url_opener.addheaders = [ua_header]
urllib.request.install_opener(url_opener)
class one(threading.Thread): # Worker thread: scrapes the odd-numbered pages (1, 3, ..., 11)
    def __init__(self):
        # Initialize the base Thread before the object can be started.
        threading.Thread.__init__(self)

    def run(self):
        # Fetch each odd page and pull every joke out with a regex.
        joke_re = re.compile('<div class="content">.*?<span>(.*?)</span>(.*?)</div>', re.S)
        for page in range(1, 12, 2):
            page_url = "https://www.qiushibaike.com/8hr/page/" + str(page) + "/"
            html = urllib.request.urlopen(page_url).read().decode("utf-8", "ignore")
            for idx, joke in enumerate(joke_re.findall(html)):
                print(" The first " + str(page) + " The first page " + str(idx) + " The content is as follows: ")
                print(joke)

class two(threading.Thread): # Worker thread: scrapes the even-numbered pages (2, 4, ..., 10)
    def __init__(self):
        # Initialize the base Thread before the object can be started.
        threading.Thread.__init__(self)

    def run(self):
        # Fetch each even page and pull every joke out with a regex.
        joke_re = re.compile('<div class="content">.*?<span>(.*?)</span>(.*?)</div>', re.S)
        for page in range(2, 12, 2):
            page_url = "https://www.qiushibaike.com/8hr/page/" + str(page) + "/"
            html = urllib.request.urlopen(page_url).read().decode("utf-8", "ignore")
            for idx, joke in enumerate(joke_re.findall(html)):
                print(" The first " + str(page) + " The first page " + str(idx) + " The content is as follows: ")
                print(joke)
# Kick off both workers; each scrapes its half of the pages concurrently.
t1 = one()
t2 = two()
for worker in (t1, t2):
    worker.start()

Related articles: