Use NodeJS and PhantomJS to grab web page information and website screenshots

  • 2020-03-29 23:48:44
  • OfStack

First, download the PhantomJS build for your platform from the official website, or download the source code and compile it yourself. Then add the PhantomJS executable to your PATH environment variable and type

$phantomjs

If PhantomJS responds, the installation works and we can proceed to the next step.

Take a simple screenshot with PhantomJS

 // Minimal PhantomJS script: open one page and save a 1024x800 screenshot of it.
 var webpage = require('webpage')
   , page = webpage.create();
 // Size of the virtual browser window.
 page.viewportSize = { width: 1024, height: 800 };
 // Region of the page that render() captures.
 page.clipRect = { top: 0, left: 0, width: 1024, height: 800 };
 page.settings = {
   javascriptEnabled: false,
   loadImages: true,
   userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/19.0'
 };
 page.open('http://www.baidu.com', function (status) {
   var failed = (status === 'fail');
   if (failed) {
     console.log('open page fail!');
   } else {
     page.render('./snapshot/test.png');
   }
   // release the memory
   page.close();
   // A PhantomJS script keeps running until it is told to stop, so end the
   // process explicitly and report the outcome through the exit code.
   phantom.exit(failed ? 1 : 0);
 });

Here we set the window size to 1024 * 800:

page.viewportSize = { width: 1024, height: 800 };

Capture a 1024 * 800 region of the page, starting from (0, 0):

page.clipRect = { top: 0, left: 0, width: 1024, height: 800 };

Disable Javascript, allow images to load, and change userAgent to "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/19.0" :

 page.settings = { javascriptEnabled: false, loadImages: true, userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/19.0'}; 

Then use page.open to open the page, and finally output the screenshot to ./snapshot/test.png:

 page.render('./snapshot/test.png') ;

 

NodeJS communicates with PhantomJS

Let's first see what PhantomJS can do with communications.

Command line pass-through
 
 Such as: 
phantomjs snapshot.js http://www.baidu.com
 Command-line arguments can only be passed when PhantomJS is launched; they are of no use for sending data while it is running. 
The standard output
 
 Standard output can carry data from PhantomJS to NodeJS, but it cannot be used to send data from NodeJS to PhantomJS. 
 In our tests, however, standard output was the fastest of these channels, so it should be considered for large data transfers. 
HTTP
 
PhantomJS issues an HTTP request to a NodeJS service, and NodeJS returns the corresponding data. 
 This approach is simple, but requests can only be initiated by PhantomJS. 
Websocket
 
 It is worth noting that PhantomJS 1.9.0 supports WebSocket, but unfortunately only the hixie-76 version of the protocol. Still, it does give NodeJS a way to actively push data to PhantomJS. 
 In our tests, however, PhantomJS needed about 1 second to connect to a local WebSocket service, so we set this method aside for now. 
phantomjs-node
 
phantomjs-node successfully lets PhantomJS be used as a NodeJS module, but let's take a look at the author's explanation of how it works: 
I will answer that question with a question. How do you communicate with a process that doesn't support shared memory, sockets, FIFOs, or standard input?
Well, there's one thing PhantomJS does support, and that's opening webpages. In fact, it's really good at opening web pages. So we communicate with PhantomJS by spinning up an instance of ExpressJS, opening Phantom in a subprocess, and pointing it at a special webpage that turns socket.io messages into alert()calls. Those alert() calls are picked up by Phantom and there you go!
The communication itself happens via James Halliday's fantastic dnode library, which fortunately works well enough when combined with browserify to run straight out of PhantomJS's pidgin Javascript environment.
 In fact, phantomjs-node also communicates over HTTP or WebSocket, but it pulls in heavy dependencies. We only want to do something simple, so we set it aside for now. 

 

The design

<img alt="" border="0" src="//files.jb51.net/file_images/article/201311/201311180934025.jpg">

 

Let's get started
We implemented it using HTTP in the first release.

First, we use the cluster module as a simple process supervisor that restarts the worker whenever it dies (index.js):


module.exports = (function () {
  "use strict"
  var cluster = require('cluster')
    , fs = require('fs');
  if(!fs.existsSync('./snapshot')) {
    fs.mkdirSync('./snapshot');
  }
  if (cluster.isMaster) {
    cluster.fork();
    cluster.on('exit', function (worker) {
      console.log('Worker' + worker.id + ' died :(');
      process.nextTick(function () {
        cluster.fork();
      });
    })
  } else {
    require('./extract.js');
  }
})();

Then use connect to build our external API (extract.js):


module.exports = (function () {
  "use strict"
  var connect = require('connect')
    , fs = require('fs')
    , spawn = require('child_process').spawn
    , jobMan = require('./lib/jobMan.js')
    , bridge = require('./lib/bridge.js')
    , pkg = JSON.parse(fs.readFileSync('./package.json'));
  var app = connect()
    .use(connect.logger('dev'))
    .use('/snapshot', connect.static(__dirname + '/snapshot', { maxAge: pkg.maxAge }))
    .use(connect.bodyParser())
    .use('/bridge', bridge)
    .use('/api', function (req, res, next) {
      if (req.method !== "POST" || !req.body.campaignId) return next();
      if (!req.body.urls || !req.body.urls.length) return jobMan.watch(req.body.campaignId, req, res, next);
      var campaignId = req.body.campaignId
        , imagesPath = './snapshot/' + campaignId + '/'
        , urls = []
        , url
        , imagePath;
      function _deal(id, url, imagePath) {
        // just push into urls list
        urls.push({
          id: id,
          url: url,
          imagePath: imagePath
        });
      }

      for (var i = req.body.urls.length; i--;) {
        url = req.body.urls[i];
        imagePath = imagesPath + i + '.png';
        _deal(i, url, imagePath);
      }
      jobMan.register(campaignId, urls, req, res, next);
      var snapshot = spawn('phantomjs', ['snapshot.js', campaignId]);
      snapshot.stdout.on('data', function (data) {
        console.log('stdout: ' + data);
      });
      snapshot.stderr.on('data', function (data) {
        console.log('stderr: ' + data);
      });
      snapshot.on('close', function (code) {
        console.log('snapshot exited with code ' + code);
      });
    })
    .use(connect.static(__dirname + '/html', { maxAge: pkg.maxAge }))
    .listen(pkg.port, function () { console.log('listen: ' + 'http://localhost:' + pkg.port); });
})();

Here we refer to two modules, bridge and jobMan.

The bridge is the HTTP communication bridge and jobMan is the job manager. We use campaignId to identify a job, hand the job and its response object over to jobMan to manage, and then launch PhantomJS to process it.

The communication bridge receives job results from PhantomJS and passes them to jobMan, or returns a job's URL list from jobMan to PhantomJS (bridge.js):


module.exports = (function () {
  "use strict"
  var jobMan = require('./jobMan.js')
    , fs = require('fs')
    , pkg = JSON.parse(fs.readFileSync('./package.json'));
  return function (req, res, next) {
      if (req.headers.secret !== pkg.secret) return next();
      // Snapshot APP can post url information
      if (req.method === "POST") {
        var body = JSON.parse(JSON.stringify(req.body));
        jobMan.fire(body);
        res.end('');
      // Snapshot APP can get the urls should extract
      } else {
        var urls = jobMan.getUrls(req.url.match(/campaignId=([^&]*)(s|&|$)/)[1]);
        res.writeHead(200, {'Content-Type': 'application/json'});
        res.statuCode = 200;
        res.end(JSON.stringify({ urls: urls }));
      }
  };
})();

If the request method is POST, we assume PhantomJS is pushing information about the job to us. If it is GET, we assume PhantomJS is requesting information about the job.

jobMan is responsible for managing jobs and sending the current job information back to the client through the response object (jobMan.js):


module.exports = (function () {
  "use strict"
  var fs = require('fs')
    , fetch = require('./fetch.js')
    , _jobs = {};
  function _send(campaignId){
    var job = _jobs[campaignId];
    if (!job) return;
    if (job.waiting) {
      job.waiting = false;
      clearTimeout(job.timeout);
      var finished = (job.urlsNum === job.finishNum)
        , data = {
        campaignId: campaignId,
        urls: job.urls,
        finished: finished
      };
      job.urls = [];
      var res = job.res;
      if (finished) {
        _jobs[campaignId] = null;
        delete _jobs[campaignId]
      }
      res.writeHead(200, {'Content-Type': 'application/json'});
      res.statuCode = 200;
      res.end(JSON.stringify(data));
    }
  }

  function register(campaignId, urls, req, res, next) {
    _jobs[campaignId] = {
      urlsNum: urls.length,
      finishNum: 0,
      urls: [],
      cacheUrls: urls,
      res: null,
      waiting: false,
      timeout: null
    };
    watch(campaignId, req, res, next);
  }
  function watch(campaignId, req, res, next) {
    _jobs[campaignId].res = res;
    // 20s timeout
    _jobs[campaignId].timeout = setTimeout(function () {
      _send(campaignId);
    }, 20000);
  }
  function fire(opts) {
    var campaignId = opts.campaignId
      , job = _jobs[campaignId]
      , fetchObj = fetch(opts.html);
    if (job) {
      if (+opts.status && fetchObj.title) {
        job.urls.push({
          id: opts.id,
          url: opts.url,
          image: opts.image,
          title: fetchObj.title,
          description: fetchObj.description,
          status: +opts.status
        });
      } else {
        job.urls.push({
          id: opts.id,
          url: opts.url,
          status: +opts.status
        });
      }
      if (!job.waiting) {
        job.waiting = true;
        setTimeout(function () {
          _send(campaignId);
        }, 500);
      }
      job.finishNum ++;
    } else {
      console.log('job can not found!');
    }
  }
  function getUrls(campaignId) {
    var job = _jobs[campaignId];
    if (job) return job.cacheUrls;
  }
  return {
    register: register,
    watch: watch,
    fire: fire,
    getUrls: getUrls
  };
})();

Here we use fetch to extract the title and description from the HTML. The fetch implementation is relatively simple (fetch.js):


module.exports = (function () {
  "use strict"
  return function (html) {
    if (!html) return { title: false, description: false };
    var title = html.match(/<title>(.*?)</title>/)
      , meta = html.match(/<metas(.*?)/?>/g)
      , description;
    if (meta) {
      for (var i = meta.length; i--;) {
        if(meta[i].indexOf('name="description"') > -1 || meta[i].indexOf('name="Description"') > -1){
          description = meta[i].match(/content="(.*?)"/)[1];
        }
      }
    }
    (title && title[1] !== '') ? (title = title[1]) : (title = 'No Title');
    description || (description = 'No Description');
    return {
      title: title,
      description: description
    };
  };
})();

Finally, here is the source code that PhantomJS runs. After it starts, it gets the job information from the bridge via HTTP; then, each time one of the job's URLs has been processed, it sends the result back to the bridge via HTTP (snapshot.js):


// PhantomJS modules and run-time configuration shared by the rest of this script.
var webpage = require('webpage')
  , args = require('system').args
  , fs = require('fs')
  // Second entry of the argument list; extract.js launches this script as
  // `phantomjs snapshot.js <campaignId>`.
  , campaignId = args[1]
  // Parsed package.json from the working directory; `port` and `secret` are read from it.
  , pkg = JSON.parse(fs.read('./package.json'));
/**
 * Opens `url` in a fresh page, renders a 1024x800 screenshot to `imagePath`
 * and queues a report for the NodeJS bridge through postMan.
 *
 * The report is a urlencoded string. `status` is 1 when the page loaded and
 * 0 when it failed; only successful reports carry `html` and `image`.
 *
 * @param {number|string} id - Identifier of the url inside the campaign.
 * @param {string} url - Address to capture.
 * @param {string} imagePath - File the screenshot is written to.
 */
function snapshot(id, url, imagePath) {
  var page = webpage.create();
  page.viewportSize = { width: 1024, height: 800 };
  page.clipRect = { top: 0, left: 0, width: 1024, height: 800 };
  page.settings = {
    javascriptEnabled: false,
    loadImages: true,
    userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/1.9.0'
  };
  page.open(url, function (status) {
    var data;
    if (status === 'fail') {
      data = [
        'campaignId=',
        campaignId,
        '&url=',
        encodeURIComponent(url),
        '&id=',
        id,
        '&status=',
        0
      ].join('');
    } else {
      page.render(imagePath);
      // Read the markup before the page is closed below.
      var html = page.content;
      // callback NodeJS
      data = [
        'campaignId=',
        campaignId,
        '&html=',
        encodeURIComponent(html),
        '&url=',
        encodeURIComponent(url),
        '&image=',
        encodeURIComponent(imagePath),
        '&id=',
        id,
        '&status=',
        1
      ].join('');
    }
    // Failures go through the same queue as successes so that postMan counts
    // every url and can shut PhantomJS down once all of them have reported.
    postMan.post(data);
    // release the memory
    page.close();
  });
}
/**
 * Serialises all HTTP traffic between this PhantomJS script and the NodeJS
 * bridge over a single page object, and ends the process once every url of
 * the campaign has been reported.
 */
var postMan = {
  postPage: null,   // page used for every bridge request
  posting: false,   // true while the queue is being drained
  datas: [],        // queued report bodies
  len: 0,           // number of urls in the campaign
  currentNum: 0,    // number of reports delivered so far

  /**
   * Fetches the campaign's url list from the bridge and starts one snapshot
   * per url.
   *
   * @param {Function} snapshot - Called as snapshot(id, url, imagePath).
   */
  init: function (snapshot) {
    // Callbacks below are not invoked with postMan as `this`.
    var that = this
      , postPage = webpage.create();
    postPage.customHeaders = {
      'secret': pkg.secret
    };
    this.postPage = postPage;
    postPage.open('http://localhost:' + pkg.port + '/bridge?campaignId=' + campaignId, function (status) {
      var urls
        , url;
      if (status !== 'success') {
        console.log('can not load the job from the bridge!');
        phantom.exit(1);
        return;
      }
      try {
        urls = JSON.parse(postPage.plainText).urls || [];
      } catch (e) {
        console.log('invalid job description: ' + e.message);
        phantom.exit(1);
        return;
      }
      that.len = urls.length;
      if (!that.len) {
        // Nothing to capture: without this the process would idle forever.
        postPage.close();
        phantom.exit();
        return;
      }
      for (var i = that.len; i--;) {
        url = urls[i];
        snapshot(url.id, url.url, url.imagePath);
      }
    });
  },

  /**
   * Queues one report and starts draining the queue if it is idle.
   *
   * @param {string} data - urlencoded request body.
   */
  post: function (data) {
    this.datas.push(data);
    if (!this.posting) {
      this.posting = true;
      this.fire();
    }
  },

  /**
   * Sends the oldest queued report; on completion moves on to the next one
   * and, after the last expected report, exits PhantomJS.
   */
  fire: function () {
    if (this.datas.length) {
      var data = this.datas.shift()
        , that = this;
      this.postPage.open('http://localhost:' + pkg.port + '/bridge', 'POST', data, function () {
        that.fire();
        // kill child process
        // NOTE(review): the 500 ms delay is kept from the original; its purpose is not stated.
        setTimeout(function () {
          if (++that.currentNum === that.len) {
            that.postPage.close();
            phantom.exit();
          }
        }, 500);
      });
    } else {
      this.posting = false;
    }
  }
};
postMan.init(snapshot);

The effect

<img alt="" border="0" src="//files.jb51.net/file_images/article/201311/201311180934026.jpg">

 


Related articles: