Python uses Tornado to implement a simple crawler

  • 2020-11-20 06:10:27
  • OfStack

This article shares example code for a simple crawler built with Python and Tornado, for your reference. The details are as follows.

The code comes from the sample in the official Tornado documentation, but as a novice I found it a little hard to read, so I added comments to make it easier to understand. The annotated code is as follows:


#!/usr/bin/env python
# coding=utf-8

import time
from datetime import timedelta

try:
  from HTMLParser import HTMLParser
  from urlparse import urljoin, urldefrag
except ImportError:
  from html.parser import HTMLParser
  from urllib.parse import urljoin, urldefrag

from tornado import httpclient, gen, ioloop, queues

# Root URL to start crawling from
base_url = 'http://www.baidu.com'
# Number of concurrent workers
concurrency = 10


# Fetch a page and return every URL it links to
@gen.coroutine
def get_links_from_url(url):

  try:
    # Fetch the URL asynchronously
    response = yield httpclient.AsyncHTTPClient().fetch(url)
    print('fetched %s' % url)
    # Decode the response body if it arrives as bytes
    html = response.body if isinstance(response.body, str) \
      else response.body.decode(errors='ignore')
    # Build the list of absolute URLs found in the page
    urls = [urljoin(url, remove_fragment(new_url))
        for new_url in get_links(html)]
  except Exception as e:
    print('Exception: %s %s' % (e, url))
    # On error, return an empty list
    raise gen.Return([])
  # Return the list of URLs found
  raise gen.Return(urls)


def remove_fragment(url):
  # Remove the anchor 
  pure_url, frag = urldefrag(url)

  return pure_url


def get_links(html):
  # Extract all URLs from an HTML page
  class URLSeeker(HTMLParser):
    def __init__(self):
      HTMLParser.__init__(self)
      self.urls = []

    def handle_starttag(self, tag, attrs):
      # Collect the href attribute of every <a> tag
      href = dict(attrs).get('href')
      if href and tag == 'a':
        self.urls.append(href)

  url_seeker = URLSeeker()
  url_seeker.feed(html)
  return url_seeker.urls


@gen.coroutine
def main():
  #  Create a queue 
  q = queues.Queue()
  #  Record the start time stamp 
  start = time.time()
  # Two sets: URLs currently being fetched and URLs already fetched
  fetching, fetched = set(), set()

  @gen.coroutine
  def fetch_url():
    # Take a URL from the queue
    current_url = yield q.get()
    try:
      # Skip URLs that are already being fetched
      if current_url in fetching:
        return

      print('fetching %s' % current_url)
      # Otherwise mark the URL as in progress
      fetching.add(current_url)
      # Fetch the page and extract the links it contains
      urls = yield get_links_from_url(current_url)
      # Record the URL in the fetched set once the request is done
      fetched.add(current_url)

      for new_url in urls:
        # Only follow links beneath the base URL
        if new_url.startswith(base_url):
          yield q.put(new_url)

    finally:
      # Mark this queue item as done (decrements the unfinished count)
      q.task_done()

  @gen.coroutine
  def worker():
    while True:
      # Keep fetching URLs until the program exits
      yield fetch_url()

  # Put the first URL into the queue
  q.put(base_url)

  # Start workers, then wait for the work queue to be empty.
  for _ in range(concurrency):
    # Start the requested number of workers
    worker()
  # Wait until every queued URL has been processed (up to 300 seconds)
  yield q.join(timeout=timedelta(seconds=300))
  # Every URL that started fetching should have finished
  assert fetching == fetched
  # Print the elapsed time and the number of URLs fetched
  print('Done in %d seconds, fetched %s URLs.' % (
    time.time() - start, len(fetched)))


if __name__ == '__main__':
  io_loop = ioloop.IOLoop.current()
  io_loop.run_sync(main)
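
A side note: the @gen.coroutine / yield / raise gen.Return() style above is the older Tornado coroutine API. On Tornado 5.0+ with Python 3.5+, the same logic can also be written with native async/await coroutines, which newer Tornado documentation recommends. The following is only a rough sketch of how the nested fetch_url could look in that style, assuming the queue, the two sets, and the helper functions stay exactly as above:


  # Sketch only: native-coroutine version of fetch_url (Tornado 5.0+, Python 3.5+)
  async def fetch_url():
    # await replaces yield when taking a URL from the queue
    current_url = await q.get()
    try:
      if current_url in fetching:
        return
      fetching.add(current_url)
      # await also accepts the Future returned by the @gen.coroutine helper,
      # so get_links_from_url does not have to be rewritten at the same time
      urls = await get_links_from_url(current_url)
      fetched.add(current_url)
      for new_url in urls:
        if new_url.startswith(base_url):
          await q.put(new_url)
    finally:
      q.task_done()


The rest of the program can stay as it is, since a yield inside a @gen.coroutine (such as worker) can also handle a native coroutine in recent Tornado versions, and IOLoop.run_sync(main) works the same way for both styles.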
