A Simple Web Crawler for Learning node.js from Zero (4)

  • 2021-07-22 08:55:12
  • OfStack

Preface

Having introduced some basic knowledge of node.js in the previous articles, our goal in this one is to put that knowledge to use: crawl web pages, do some simple analysis, and output and save the extracted information.

The idea behind a crawler is simple:

  1. Determine the URL to grab;
  2. Crawl that URL to obtain the page content;
  3. Analyze and store the content;
  4. Repeat from step 1.

In this section, we use two important modules for crawling:

  • request: a wrapper around the http module that provides a more convenient interface and makes asynchronous requests easy. More information can be found in this article.
  • cheerio: similar to jQuery; you can use $(), find(), text(), html() and other methods to extract elements and data from a page, although cheerio does not implement quite as many methods as jQuery (a short example follows below).
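
To get a feel for cheerio before touching any real page, here is a minimal standalone sketch (the HTML snippet and class names are made up for illustration):

var cheerio = require('cheerio');

//  Load an HTML string and query it the same way you would with jQuery
var $ = cheerio.load('<ul class="list"><li class="item">hello</li><li class="item">world</li></ul>');

console.log( $('.list').find('.item').first().text() ); // "hello"
console.log( $('.list').html() ); //  Inner HTML of the list

$('.item').each(function(){
 console.log( $(this).text() ); //  "hello", then "world"
});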

1. hello world

As a "hello world", let's start with the simplest possible grab. We take the CNode website (https://cnodejs.org/) as an example. Its characteristics are:

  • You don't need to log in to access the home page and other pages
  • Pages are rendered synchronously (server-side), so there is no issue with content loaded by asynchronous requests
  • The DOM has a clear structure

The code is as follows:


var request = require('request'),
 cheerio = require('cheerio');

request('https://cnodejs.org/', function(err, response, body){
 if( !err && response.statusCode == 200 ){
 // body is the page source code
 //  cheerio.load converts the string into a cheerio (jQuery-like) object,
 //  which can then be manipulated the same way as with jQuery
 var $ = cheerio.load(body);
 
 //  Output the navigation html
 console.log( $('.nav').html() );
 }
});

This short piece of code implements a simple crawler: it fetches the source code, which we can then take apart and analyze. For example, suppose we want the title, author, link, visit count and reply count of each topic on the first page. Inspecting the page with Chrome's developer tools, we find the following structure:

Each div.cell is a complete topic unit; below, one such unit is referred to as $item:


{
 title : $item.find('.topic_title').text(),
 url : $item.find('.topic_title').attr('href'),
 author : $item.find('.user_avatar img').attr('title'),
 reply : $item.find('.count_of_replies').text(),
 visits : $item.find('.count_of_visits').text()
}

Therefore, by looping over each div.cell, we can collect the information we want:


request('https://cnodejs.org/?_t='+Date.now(), function(err, response, body){
 if( !err && response.statusCode == 200 ){
 var $ = cheerio.load(body);

 var data = [];
 $('#topic_list .cell').each(function(){
  var $this = $(this);
 
 //  Use trim to remove the whitespace at both ends of each field
  data.push({
  title : trim($this.find('.topic_title').text()),
  url : trim($this.find('.topic_title').attr('href')),
  author : trim($this.find('.user_avatar img').attr('title')),
  reply : trim($this.find('.count_of_replies').text()),
  visits : trim($this.find('.count_of_visits').text())
  })
 });
 // console.log( JSON.stringify(data, ' ', 4) );
 console.log(data);
 }
});

//  Remove whitespace from both ends of a string
function trim(str){ 
 return str.replace(/(^\s*)|(\s*$)/g, "");
}
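
Note that modern node.js versions already ship a built-in String.prototype.trim(), so the helper above is mostly for illustration:

//  Built-in alternative to the trim helper above (available since ES5)
console.log( '   hello node   '.trim() ); // "hello node"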

2. Crawl multiple pages

Above we only crawled one page. How can we crawl multiple pages in one program? Taking the CNode website as an example again: we just crawled the data on page 1, and now we want to request the first 6 pages (don't grab too many pages at the same time, or your IP will get blocked). Every page has the same structure, so we only need to change the url.

2.1 Crawling multiple pages at the same time

First, wrap the request call in a method so it is convenient to call. If we kept using console.log, all 6 pages of data would be dumped to the console, which is awkward to read. Instead, we use the file operations from the previous section: require the fs module, write the fetched content to a file, and place the newly created files in the ./file directory (you need to create this directory manually first):


var fs = require('fs'); //  The file module introduced in the previous section

//  page is passed in as a parameter, then request is called to crawl that page
function getData(page){
 var url = 'https://cnodejs.org/?tab=all&page='+page;
 console.time(url);
 request(url, function(err, response, body){
 if( !err && response.statusCode == 200 ){
   console.timeEnd(url); //  console.time / console.timeEnd measure how long this url took to crawl

  var $ = cheerio.load(body);

  var data = [];
  $('#topic_list .cell').each(function(){
  var $this = $(this);

  data.push({
   title : trim($this.find('.topic_title').text()),
   url : trim($this.find('.topic_title').attr('href')),
   author : trim($this.find('.user_avatar img').attr('title')),
   reply : trim($this.find('.count_of_replies').text()),
   visits : trim($this.find('.count_of_visits').text())
  })
  });
  // console.log( JSON.stringify(data, ' ', 4) );
  // console.log(data);
  var filename = './file/cnode_'+page+'.txt';
   fs.writeFile(filename, JSON.stringify(data, null, 4), function(){
  console.log( filename + '  Write successful ' );
  })
 }
 });
}

The CNode pagination URL looks like https://cnodejs.org/?tab=all&page=2, so we just need to vary the value of page:


var max = 6;
for(var i=1; i<=max; i++){
 getData(i);
}

In this way, the first 6 pages are requested at the same time. When the script runs, it prints how long each link took to crawl, and once a page has been crawled successfully its data is written to a file:


$ node test.js
 Start a request ...
https://cnodejs.org/?tab=all&page=1: 279ms
./file/cnode_1.txt  Write successful 
https://cnodejs.org/?tab=all&page=3: 372ms
./file/cnode_3.txt  Write successful 
https://cnodejs.org/?tab=all&page=2: 489ms
./file/cnode_2.txt  Write successful 
https://cnodejs.org/?tab=all&page=4: 601ms
./file/cnode_4.txt  Write successful 
https://cnodejs.org/?tab=all&page=5: 715ms
./file/cnode_5.txt  Write successful 
https://cnodejs.org/?tab=all&page=6: 819ms
./file/cnode_6.txt  Write successful 

We can see the 6 output files in the ./file directory.
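
Each file contains the JSON produced by JSON.stringify, so reading one back is straightforward; a minimal sketch (the file name follows the pattern used above):

var fs = require('fs');

//  Read one of the saved files back and parse it into an array of topics
fs.readFile('./file/cnode_1.txt', 'utf8', function(err, content){
 if(err) throw err;
 var topics = JSON.parse(content);
 console.log(topics.length + ' topics, first title: ' + topics[0].title);
});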

2.2 Control the number of simultaneous requests

With the for loop above, all requests are fired at the same time. If we requested 100, 200 or 500 pages like this, we would hit the server with a large number of requests in a short time and eventually get our IP blocked. Below is a simple scheduler that allows at most 5 requests to be in flight at once; whenever one request finishes, the next one is taken from the queue.


/*
 @param data {Array}  The queue of items to request (here, the page numbers)
 @param max  {Number}  Maximum number of simultaneous requests
*/
function Dispatch(data, max){
 var _max = max || 5, //  Maximum number of concurrent requests
 _dataObj = data || [], //  Queue of items still to be requested
 _cur = 0, //  Number of requests currently in flight
 _num = _dataObj.length || 0,
 _isEnd = false,
 _callback;

 var ss = function(){
 var s = _max - _cur;
 while(s--){
  if( !_dataObj.length ){
  _isEnd = true;
  break;
  }
  var surl = _dataObj.shift();
  _cur++;

  _callback(surl);
 }
 }

 this.start = function(callback){
 _callback = callback;

 ss();
 },

 this.call = function(){
 if( !_isEnd ){
  _cur--;
  ss();
 }
 }
}

//  Page numbers to crawl and the maximum number of concurrent requests
var urls = [1, 2, 3, 4, 5, 6],
 max = 5;

var dis = new Dispatch(urls, max);
dis.start(getData);

Then, inside getData, call dis.call() after writing the file:


var filename = './file/cnode_'+page+'.txt';
fs.writeFile(filename, JSON.stringify(data, null, 4), function(){
 console.log( filename + '  Write successful ' );
})
dis.call();

In this way, the number of simultaneous requests is kept under control even though the calls are asynchronous.
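
One caveat worth noting: in the getData above, dis.call() is only reached on a successful response. If a request errors out or returns a non-200 status, _cur is never decremented and the scheduler slowly loses concurrency. Below is a sketch of getData with that handled, assuming request, cheerio, fs, trim and dis are defined as in the earlier snippets (only the title field is kept, for brevity):

function getData(page){
 var url = 'https://cnodejs.org/?tab=all&page=' + page;
 request(url, function(err, response, body){
  if( !err && response.statusCode == 200 ){
   var $ = cheerio.load(body);
   var data = [];
   $('#topic_list .cell').each(function(){
    var $this = $(this);
    data.push({ title : trim($this.find('.topic_title').text()) });
   });
   fs.writeFile('./file/cnode_'+page+'.txt', JSON.stringify(data, null, 4), function(){
    dis.call(); //  Success: free the slot once the file has been written
   });
  }else{
   console.log(url + ' failed, skipping');
   dis.call(); //  Failure: still free the slot so the queue keeps moving
  }
 });
}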

3. Crawl the page you need to log in to

Some websites, such as CNode and Baidu Tieba, can be crawled directly without logging in, but websites such as Zhihu require a login first; otherwise you are redirected straight to the login page. How do we crawl in that situation?

Use cookies. After a user logs in, the site stores some user information in cookies. If we send those cookies along when grabbing a page, the server treats us as logged in, and the program can fetch the information we want.

Log in to your account in the browser first, then read document.cookie in the console to get the full cookie string, and copy it into the cookie field of the program below (if you know which cookies are not needed, you can leave them out).


request({
 url:'https://www.zhihu.com/explore',
 headers:{
 // "Referer":"www.zhihu.com"
 cookie : 'xxx' //  Paste the cookie string copied from your browser here
 }
}, function(error, response, body){
 if (!error && response.statusCode == 200) {
 // console.log( body );
 var $ = cheerio.load(body);

 
 }
})

In request you can also set the Referer header. Some interfaces or resources are restricted by referer so that they can only be accessed from a certain domain; in that case you can forge the Referer in the request headers.
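
A minimal sketch of forging the Referer, using a made-up URL for a resource that only allows access from its own domain:

var request = require('request');

request({
 url : 'http://example.com/some/protected/resource', //  Hypothetical referer-protected URL
 headers : {
  'Referer' : 'http://example.com/' //  Pretend the request comes from a page on that domain
 }
}, function(err, response, body){
 if( !err && response.statusCode == 200 ){
  console.log('fetched ' + body.length + ' bytes');
 }
});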

4. Save the captured picture

Text content extracted from a page can be saved to a text file or a database, but how do we save images locally?

An image can be downloaded by piping the request response into a writable file stream created with fs.createWriteStream.
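
The core pattern in isolation looks like this (the image URL and output path are placeholders):

var request = require('request'),
 fs = require('fs');

//  Stream the image straight to disk without buffering it all in memory
request('http://example.com/some-picture.jpg')
 .pipe(fs.createWriteStream('./some-picture.jpg'))
 .on('close', function(){
  console.log('image saved');
 });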

Here we save the pictures into a directory named after the current date; mkdirp can create the whole multi-level path in one call (e.g. ./img/2017/01/22). The saved picture can keep its original name or be renamed according to our own rules.


var request = require('request'),
 cheerio = require('cheerio'),
 fs = require('fs'),
 path = require('path'), //  Used to parse the picture name and extension
 mkdirp = require('mkdirp'); //  Used to create multi-level directories

var date = new Date(),
 year = date.getFullYear(),
 month = date.getMonth()+1,
 month = ('00'+month).slice(-2), //  Pad with a leading zero
 day = date.getDate(),
 day = ('00'+day).slice(-2), //  Pad with a leading zero
 dir = './img/'+year+'/'+month+'/'+day+'/';

//  Create a directory based on the date, e.g.  ./img/2017/01/22/
//  Note: fs.statSync throws if the path does not exist yet, so check with fs.existsSync instead
if( fs.existsSync(dir) ){
 console.log(dir + ' already exists');
}else{
 console.log('Creating directory ' + dir);
 mkdirp.sync(dir); //  Create the multi-level path synchronously so it exists before downloads start
}

request({
 url : 'http://desk.zol.com.cn/meinv/?_t='+Date.now()
}, function(err, response, body){
 if(err) throw err;

 if( response.statusCode == 200 ){
 var $ = cheerio.load(body);
 
 $('.photo-list-padding img').each(function(){
  var $this = $(this),
  imgurl = $this.attr('src');
  
   var ext = path.extname(imgurl); //  Get the picture's extension, e.g.  .jpg, .png, .gif
   var filename = Date.now()+'_'+ parseInt(Math.random()*10000)+ext; //  Naming scheme: millisecond timestamp + random number + extension
   // var filename = path.basename(imgurl); //  Or keep the picture's original name
   // console.log(filename);
   download(imgurl, dir+filename); //  Start downloading the picture
 })
 }
});

//  Save the picture to disk
var download = function(imgurl, filename){
 request.head(imgurl, function(err, res, body) {
  if(err) return console.log(imgurl + ' failed');
  //  Log success only after the write stream has finished
  request(imgurl).pipe(fs.createWriteStream(filename)).on('close', function(){
   console.log(filename + ' success!');
  });
 });
}

You can see the downloaded pictures in the corresponding date directory (e.g. ./img/2017/01/22/).

Summary

We have only written a simple crawler here; more complex features require more sophisticated control logic. How to capture data loaded via ajax is another topic, which we will explain later. That is the whole content of this article; I hope it is of some help to your study or work. This site will continue to share articles about getting started with node, so interested readers are welcome to keep following along.

