Using Tornado in Python to implement a simple crawler
This article shares example code for a simple crawler built with Python and Tornado, for your reference.
The code comes from the sample in Tornado's official documentation, but it can be hard for a Tornado newcomer to read, so I have added comments to make it easier to understand. The code is as follows:
#!/usr/bin/env python
# coding=utf-8
import time
from datetime import timedelta

try:
    # Python 2
    from HTMLParser import HTMLParser
    from urlparse import urljoin, urldefrag
except ImportError:
    # Python 3
    from html.parser import HTMLParser
    from urllib.parse import urljoin, urldefrag

from tornado import httpclient, gen, ioloop, queues

# The URL to start crawling from
base_url = 'http://www.baidu.com'
# Number of concurrent workers
concurrency = 10

# Fetch a page and return all of the URLs it links to
@gen.coroutine
def get_links_from_url(url):
    try:
        # Issue an asynchronous HTTP request for the URL
        response = yield httpclient.AsyncHTTPClient().fetch(url)
        print('fetched %s' % url)
        # Decode the response body if it arrived as bytes
        html = response.body if isinstance(response.body, str) \
            else response.body.decode(errors='ignore')
        # Build the list of absolute URLs, with anchors stripped
        urls = [urljoin(url, remove_fragment(new_url))
                for new_url in get_links(html)]
    except Exception as e:
        print('Exception: %s %s' % (e, url))
        # On error, return an empty list
        raise gen.Return([])
    # Return the list of URLs
    raise gen.Return(urls)
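
# Note (added for clarity, not part of the original sample): `raise gen.Return(value)`
# is how an old-style @gen.coroutine function returns a value on Python 2;
# on Python 3.3+ a plain `return urls` inside the coroutine works the same way.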

def remove_fragment(url):
    # Strip the #fragment (anchor) portion from the URL
    pure_url, frag = urldefrag(url)
    return pure_url
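
# For illustration (not in the original sample): urldefrag splits the fragment
# off a URL, e.g. urldefrag('http://www.baidu.com/page#top') yields the pair
# ('http://www.baidu.com/page', 'top').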

def get_links(html):
    # Extract the URLs from an HTML page
    class URLSeeker(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            # Collect the href attribute of every <a> tag
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    url_seeker = URLSeeker()
    url_seeker.feed(html)
    return url_seeker.urls
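
# For illustration (not in the original sample): feeding a small snippet shows
# what get_links returns:
#     get_links('<a href="/s?wd=tornado">search</a>')  ->  ['/s?wd=tornado']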

@gen.coroutine
def main():
    # Create the work queue
    q = queues.Queue()
    # Record the start timestamp
    start = time.time()
    # Two sets: URLs currently being fetched and URLs already fetched
    fetching, fetched = set(), set()

    @gen.coroutine
    def fetch_url():
        # Pull the next URL from the queue
        current_url = yield q.get()
        try:
            # Skip URLs that are already being processed
            if current_url in fetching:
                return
            print('fetching %s' % current_url)
            # Mark the URL as in progress
            fetching.add(current_url)
            # Extract the links from the newly fetched page
            urls = yield get_links_from_url(current_url)
            # Move the finished URL into the fetched set
            fetched.add(current_url)
            for new_url in urls:
                # Only follow links beneath the base URL
                if new_url.startswith(base_url):
                    yield q.put(new_url)
        finally:
            # Mark this queue item as done (decrements the pending count)
            q.task_done()

    @gen.coroutine
    def worker():
        # Each worker loops forever, pulling URLs off the queue
        while True:
            yield fetch_url()

    # Seed the queue with the starting URL
    q.put(base_url)

    # Start the workers, then wait for the work queue to be empty
    for _ in range(concurrency):
        worker()
    # Wait up to 300 seconds for all queued items to be processed
    yield q.join(timeout=timedelta(seconds=300))
    # Sanity check: every URL we started fetching was fetched
    assert fetching == fetched
    # Print the elapsed time and the number of URLs fetched
    print('Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched)))

if __name__ == '__main__':
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(main)
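
For reference, here is a sketch of the same crawler rewritten with native async/await coroutines, which replace @gen.coroutine and raise gen.Return on Python 3.7+ with Tornado 5 or 6. This is my own adaptation rather than code from the official documentation, so treat it as a starting point:

#!/usr/bin/env python
# A minimal async/await adaptation of the crawler above
# (a sketch, assuming Python 3.7+ and Tornado 5/6).
import asyncio
import time
from datetime import timedelta
from html.parser import HTMLParser
from urllib.parse import urljoin, urldefrag

from tornado import httpclient, queues

base_url = 'http://www.baidu.com'
concurrency = 10


async def get_links_from_url(url):
    try:
        response = await httpclient.AsyncHTTPClient().fetch(url)
        print('fetched %s' % url)
        html = response.body.decode(errors='ignore')
        # Resolve each link against the page URL and drop fragments
        return [urljoin(url, urldefrag(new_url)[0])
                for new_url in get_links(html)]
    except Exception as e:
        print('Exception: %s %s' % (e, url))
        return []


def get_links(html):
    # Same href-collecting parser as in the original listing
    class URLSeeker(HTMLParser):
        def __init__(self):
            super().__init__()
            self.urls = []

        def handle_starttag(self, tag, attrs):
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    seeker = URLSeeker()
    seeker.feed(html)
    return seeker.urls


async def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched = set(), set()

    async def fetch_url():
        current_url = await q.get()
        try:
            if current_url in fetching:
                return
            print('fetching %s' % current_url)
            fetching.add(current_url)
            urls = await get_links_from_url(current_url)
            fetched.add(current_url)
            for new_url in urls:
                if new_url.startswith(base_url):
                    await q.put(new_url)
        finally:
            q.task_done()

    async def worker():
        while True:
            await fetch_url()

    await q.put(base_url)
    workers = [asyncio.create_task(worker()) for _ in range(concurrency)]
    await q.join(timeout=timedelta(seconds=300))
    assert fetching == fetched
    print('Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched)))
    # Shut the idle workers down so the program can exit
    for w in workers:
        w.cancel()


if __name__ == '__main__':
    asyncio.run(main())

The queue/worker structure is unchanged; only the coroutine syntax and the entry point (asyncio.run instead of IOLoop.run_sync) differ.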