Build a hand-rolled Java web crawler framework in half an hour, with complete source code

  • 2021-10-15 10:35:10
  • OfStack

I have recently been working on a search-related project that needs to crawl links from the web and store them in an index library. Although there are many powerful open-source crawler frameworks available, I wrote a simple web crawler myself in the spirit of learning, in order to understand the underlying principles. Today I would like to share this simple crawler with you!

First, introduce the functions of each class:

DownloadPage.java downloads the page source code for a given hyperlink. FunctionUtils.java provides various static helper methods, including: matching page links with a regular expression, obtaining the elements of a URL, deciding whether to create a file, normalizing a page URL into a standard form, and extracting the target content from a page's source. HrefOfPage.java extracts the hyperlinks from a page's source code. UrlDataHanding.java ties the other classes together: it takes a URL, fetches its data, and hands the data on for processing. UrlQueue.java holds the queue of URLs that have not yet been visited, and VisitedUrlQueue.java holds the queue of URLs that have already been visited.

Here is the source code for each class:

DownloadPage. java This class uses the HttpClient component.


package com.sreach.spider;

import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;

/**
 * @author binghe
 */
public class DownloadPage {

 /**
  * Downloads the page content for the given URL and returns it as text.
  * The URL is recorded in the visited queue once a response arrives, and
  * the page source is saved to disk when the URL matches the target
  * pattern and the source contains the goal content (see FunctionUtils).
  *
  * @param url the absolute URL to fetch
  * @return the page body as a String, or null if the request failed or
  *         the response had no body
  */
 public static String getContentFormUrl(String url) {
  /* NOTE(review): DefaultHttpClient is deprecated in later HttpClient
   * releases, and a new client per call is costly; kept to preserve
   * the original dependency surface. */
  HttpClient client = new DefaultHttpClient();
  HttpGet getHttp = new HttpGet(url);

  String content = null;

  try {
   /* Execute the request and obtain the response entity. */
   HttpResponse response = client.execute(getHttp);
   HttpEntity entity = response.getEntity();

   /* Mark this URL as visited so it is not crawled again. */
   VisitedUrlQueue.addElem(url);

   if (entity != null) {
    /* Decode explicitly as UTF-8; the no-charset overload falls
     * back to the platform default, which varies per machine. */
    content = EntityUtils.toString(entity, "UTF-8");

    /* Save the page locally when the URL matches the target
     * pattern and the source contains the goal marker. */
    if (FunctionUtils.isCreateFile(url)
      && FunctionUtils.isHasGoalContent(content) != -1) {
     FunctionUtils.createFile(
       FunctionUtils.getGoalContent(content), url);
    }
   }

  } catch (IOException e) {
   /* ClientProtocolException is an IOException subtype, so a
    * single catch covers both of the original's catch blocks. */
   e.printStackTrace();
  } finally {
   /* Always release the underlying connections. */
   client.getConnectionManager().shutdown();
  }

  return content;
 }

}

FunctionUtils. java Methods of this class are static methods


package com.sreach.spider;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * @author binghe
 */
public class FunctionUtils {

 /**
  * Regular expression matching hyperlinks whose pages should be saved
  * locally (oschina code-explore file pages).
  */
 private static String pat = "http://www\\.oschina\\.net/code/explore/.*/\\w+\\.[a-zA-Z]+";
 private static Pattern pattern = Pattern.compile(pat);

 /**
  * Pattern for a file-name-like final path segment, e.g. "Foo.java".
  * Compiled once instead of on every createFile() call.
  */
 private static final Pattern FILE_NAME_PATTERN = Pattern
   .compile("\\w+\\.[a-zA-Z]+");

 /**
  * Depth of crawler search. NOTE(review): never consulted by the
  * crawler in this codebase — presumably intended for future use.
  */
 public static int depth = 0;

 /** Utility class; not instantiable. */
 private FunctionUtils() {
 }

 /**
  * Splits the URL on "/" to obtain its path elements.
  *
  * @param url the URL to split
  * @return the "/"-separated elements
  */
 public static String[] divUrl(String url) {
  return url.split("/");
 }

 /**
  * Decides whether a local file should be created for this URL, i.e.
  * whether the URL matches the target-site pattern.
  *
  * @param url the page URL
  * @return true if the whole URL matches the save pattern
  */
 public static boolean isCreateFile(String url) {
  return pattern.matcher(url).matches();
 }

 /**
  * Creates the local file for the given page and writes the content to
  * it. The directory layout mirrors the URL path under "D:".
  *
  * @param content the text to write
  * @param urlPath the page URL determining the file location and name
  */
 public static void createFile(String content, String urlPath) {
  /* Split the URL; elems[0] is the scheme ("http:") and is skipped. */
  String[] elems = divUrl(urlPath);
  StringBuilder path = new StringBuilder();

  File dir = null;
  for (int i = 1; i < elems.length; i++) {
   if (i != elems.length - 1) {
    /* Intermediate path element: extend the directory path. */
    path.append(elems[i]);
    path.append(File.separator);
    dir = new File("D:" + File.separator + path.toString());
   } else {
    /* Final element must look like a file name ("name.ext"). */
    Matcher matcher = FILE_NAME_PATTERN.matcher(elems[i]);
    if (matcher.matches()) {
     /* Guard against URLs with no intermediate directories:
      * the original dereferenced a null File here. */
     if (dir != null && !dir.exists()) {
      dir.mkdirs();
     }
     String[] fileName = elems[i].split("\\.");
     File target = new File("D:" + File.separator + path.toString()
       + File.separator + fileName[0] + ".txt");
     BufferedWriter writer = null;
     try {
      target.createNewFile();
      /* Write as UTF-8 explicitly; the platform default
       * charset varies between machines. */
      writer = new BufferedWriter(new OutputStreamWriter(
        new FileOutputStream(target), "UTF-8"));
      writer.write(content);
      writer.flush();
      System.out.println(" File created successfully ");
     } catch (IOException e) {
      e.printStackTrace();
     } finally {
      /* Always close the stream; the original kept a shared
       * static writer that stayed open on failure. */
      if (writer != null) {
       try {
        writer.close();
       } catch (IOException e) {
        e.printStackTrace();
       }
      }
     }
    }
   }
  }
 }

 /**
  * Normalizes an href value into a complete URL.
  *
  * @param href the raw href attribute value
  * @return the absolute URL, or null for unsupported forms such as
  *         anchors (href="#") and relative paths without a leading "/"
  */
 public static String getHrefOfInOut(String href) {
  String resultHref = null;

  if (href.startsWith("http://")) {
   /* Already an absolute (external) link. */
   resultHref = href;
  } else if (href.startsWith("/")) {
   /* Site-relative link: prefix the site host. Other formats are
    * ignored and yield null. */
   resultHref = "http://www.oschina.net" + href;
  }

  return resultHref;
 }

 /**
  * Extracts the target content of a page: the text between the first
  * occurrence of a pre tag with a class attribute and the following
  * closing pre tag.
  *
  * @param content the page source; must contain the marker — check
  *                with {@link #isHasGoalContent(String)} first
  * @return the extracted content
  */
 public static String getGoalContent(String content) {
  int sign = content.indexOf("<pre class=\"");
  String signContent = content.substring(sign);

  int start = signContent.indexOf(">");
  int end = signContent.indexOf("</pre>");

  return signContent.substring(start + 1, end);
 }

 /**
  * Checks whether the page source contains the target-content marker.
  *
  * @param content the page source
  * @return the index of the marker, or -1 if absent
  */
 public static int isHasGoalContent(String content) {
  return content.indexOf("<pre class=\"");
 }
}

HrefOfPage. java This class is a hyperlink to get a page


package com.sreach.spider;
/**
 * @author binghe
 *
 */
public class HrefOfPage {
 /**
  * Extracts hyperlinks from page source and enqueues crawlable ones.
  *
  * Splits the source on the literal anchor prefix, normalizes each raw
  * href via FunctionUtils.getHrefOfInOut, and adds links that point
  * under /code/explore and have not been seen in either queue.
  *
  * @param content the page source HTML
  */
 public static void getHrefOfContent(String content) {
  System.out.println(" Begin ");
  String[] contents = content.split("<a href=\"");
  for (int i = 1; i < contents.length; i++) {
   /* The href value ends at the closing quote. */
   int endHref = contents[i].indexOf("\"");

   /* Normalize to an absolute URL (null for unsupported forms such
    * as anchors). The original called getHrefOfInOut a second time
    * on the already-normalized result; once suffices, because an
    * absolute URL is returned unchanged. */
   String href = FunctionUtils.getHrefOfInOut(contents[i].substring(
     0, endHref));

   if (href != null) {
    /* Enqueue only unseen links inside the code-explore tree. */
    if (!UrlQueue.isContains(href)
      && href.indexOf("/code/explore") != -1
      && !VisitedUrlQueue.isContains(href)) {
     UrlQueue.addElem(href);
    }
   }
  }

  System.out.println(UrlQueue.size() + "-- Number of connections captured ");
  System.out.println(VisitedUrlQueue.size() + "-- Number of pages processed ");
 }
}

UrlDataHanding.java: this class mainly takes URLs from the unvisited queue, downloads the corresponding pages, parses out new URLs, records the visited URLs, and so on. It implements the Runnable interface.


package com.sreach.spider;
/**
 * @author binghe
 *
 */
public class UrlDataHanding implements Runnable {
 /**
  * Downloads the given page and scans it for further links, which are
  * pushed onto the unvisited queue.
  *
  * @param url the URL to crawl
  */
 public void dataHanding(String url) {
  HrefOfPage.getHrefOfContent(DownloadPage.getContentFormUrl(url));
 }

 /**
  * Worker loop: drains the unvisited queue until it is empty.
  */
 public void run() {
  while (!UrlQueue.isEmpty()) {
   String url;
   try {
    /* isEmpty() and outElem() are synchronized separately, so
     * another worker may empty the queue between the two calls
     * and removeFirst() then throws. Treat that as "queue
     * drained" instead of letting this thread die. */
    url = UrlQueue.outElem();
   } catch (java.util.NoSuchElementException e) {
    break;
   }
   dataHanding(url);
  }
 }
}

UrlQueue. java This class is mainly used to store unaccessed URL queues


package com.sreach.spider;

import java.util.LinkedList;
/**
 * @author binghe
 *
 */
public class UrlQueue {
 /**
  * FIFO queue of hyperlinks still waiting to be crawled.
  * NOTE(review): kept public for backward compatibility; callers
  * should go through the synchronized accessors below.
  */
 public static LinkedList<String> urlQueue = new LinkedList<String>();

 /**
  * Maximum number of hyperlinks intended for the queue.
  * NOTE(review): declared but never enforced by addElem; left
  * unenforced here to avoid silently dropping URLs.
  */
 public static final int MAX_SIZE = 10000;

 /** Appends a URL to the tail of the queue. */
 public synchronized static void addElem(String url) {
  urlQueue.add(url);
 }

 /**
  * Removes and returns the URL at the head of the queue.
  *
  * @throws java.util.NoSuchElementException if the queue is empty
  */
 public synchronized static String outElem() {
  return urlQueue.removeFirst();
 }

 /** @return true if no URLs are pending */
 public synchronized static boolean isEmpty() {
  return urlQueue.isEmpty();
 }

 /**
  * @return the number of pending URLs
  */
 public synchronized static int size() {
  /* Synchronized for consistent visibility across worker threads;
   * the original left size() and isContains() unsynchronized while
   * the mutators were synchronized. */
  return urlQueue.size();
 }

 /** @return true if the given URL is already queued */
 public synchronized static boolean isContains(String url) {
  return urlQueue.contains(url);
 }
}

VisitedUrlQueue.java is mainly used to store the visited URLs. A HashSet is used because each visited URL should be stored only once, and HashSet meets that requirement exactly.


package com.sreach.spider;

import java.util.HashSet;

/**
 * Set of URLs that have already been crawled. Backed by a HashSet since
 * each visited URL needs to be remembered exactly once.
 *
 * @author binghe
 */
public class VisitedUrlQueue {
 public static HashSet<String> visitedUrlQueue = new HashSet<String>();

 /** Records the given URL as visited. */
 public synchronized static void addElem(String url) {
  visitedUrlQueue.add(url);
 }

 /** Reports whether the given URL was visited before. */
 public synchronized static boolean isContains(String url) {
  boolean seen = visitedUrlQueue.contains(url);
  return seen;
 }

 /** Returns the count of distinct visited URLs. */
 public synchronized static int size() {
  return visitedUrlQueue.size();
 }
}

Test. java This class is a test class


import com.sreach.spider.UrlDataHanding;
import com.sreach.spider.UrlQueue;
/**
 * Entry point: seeds the unvisited queue with a few starting URLs and
 * starts a pool of crawler worker threads.
 *
 * @author binghe
 */
public class Test {

 /** Number of concurrent crawler worker threads. */
 private static final int WORKER_COUNT = 10;

 public static void main(String[] args) {
  /* Seed URLs: the explore section and a sample project tree. */
  String url = "http://www.oschina.net/code/explore/achartengine/client/AndroidManifest.xml";
  String url1 = "http://www.oschina.net/code/explore";
  String url2 = "http://www.oschina.net/code/explore/achartengine";
  String url3 = "http://www.oschina.net/code/explore/achartengine/client";

  UrlQueue.addElem(url);
  UrlQueue.addElem(url1);
  UrlQueue.addElem(url2);
  UrlQueue.addElem(url3);

  /* The original imported java.sql.SQLException and declared
   * "throws SQLException" without ever touching a database; both
   * have been removed. */
  UrlDataHanding[] url_Handings = new UrlDataHanding[WORKER_COUNT];

  for (int i = 0; i < WORKER_COUNT; i++) {
   url_Handings[i] = new UrlDataHanding();
   new Thread(url_Handings[i]).start();
  }
 }
}

One final note: because I crawled oschina, the URL regular expression used here is not suitable for other sites and needs to be modified accordingly. You could also move the expression into an XML configuration file.


Related articles: