Java uses open source frameworks to implement a simple web search engine

Introduction

Using Java open source libraries, this article builds a search engine that crawls the content of a website, follows the links found on each page to crawl deeper, and collects all related web addresses and their contents, so that users can search all related websites by keyword. A minimal end-to-end sketch follows the feature list below.

Features

(1) The user specifies the URL of the page whose content is to be crawled.
(2) The page content is parsed and all URL links in it are extracted.
(3) The user can set the crawl depth: starting from the page at the initial URL, the crawler follows every URL found on it, then every URL found on those pages, and so on. The deeper it goes, the more sites are crawled.
(4) The content of every crawled URL is saved and indexed. The indexed content is the URL address itself and the title of the corresponding page.
(5) Users can search for URL addresses by keyword.
(6) Both index building and index searching recognize Chinese keywords and apply word segmentation to them.
(7) The user can specify the path where the index is saved, the initial URL, the crawl depth, the keywords to search for, and the maximum number of matches.
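
As a rough illustration of how these steps fit together, here is a minimal end-to-end sketch built from the classes in the sources below (the start URL, depth, index path, and query are placeholders, and this wrapper class is not part of the original code):

package webCrawler;

import webCrawler.Spider.Spider;
import webCrawler.Index.SearchIndex;

public class Demo {
  public static void main(String[] args) throws Exception {
    String indexPath = "d:/index-spider";            // where the index is saved (placeholder)
    Spider spider = new Spider("http://www.zju.edu.cn", 2);
    spider.getAll();                                 // collect URLs up to depth 2
    spider.storeURLsAndInfo(indexPath);              // fetch page titles and build the index
    SearchIndex search = new SearchIndex(indexPath);
    if (search.search("text", "计算机", 20) != -1) {   // search the "text" field, at most 20 hits
      search.printHits();
    }
  }
}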

Open source frameworks

Lucene (full-text indexing and search) and Jsoup (HTML fetching and parsing). The code also uses IK Analyzer (org.wltea.analyzer) for Chinese word segmentation.
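
Feature (6) relies on IK Analyzer for the Chinese word segmentation. As a minimal sketch of what the analyzer does (this demo class and its sample string are illustrative, not part of the original sources), the standard Lucene 4.x TokenStream API prints the tokens IK produces:

package webCrawler.Index;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class SegmentDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new IKAnalyzer();
    /* Tokenize an arbitrary Chinese sample string */
    TokenStream stream = analyzer.tokenStream("text", new StringReader("浙江大学计算机学院"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString());  // one segmented word per line
    }
    stream.end();
    stream.close();
  }
}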

Source code

Crawler section: Spider.java


package webCrawler.Spider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Scanner;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import webCrawler.Index.BuildIndex;

/**
 * @author lannooo
 */

public class Spider {
  ArrayList<String> URLs;
  private String startURL;
  private int digLevel;

  /**
   * @param startURL  the URL the crawler starts from
   * @param digLevel  the depth of the crawl
   */
  public Spider(String startURL, int digLevel){
    this.startURL = startURL;
    this.digLevel = digLevel;
    this.URLs = new ArrayList<>();
  }

  /**
   * @param level  the remaining depth of the current crawl
   * @param arrayList  the set of URLs to crawl in the next round
   * @return  the new set of URLs obtained by crawling the given URL set
   * @throws IOException
   */
  public ArrayList<String> getLevelURLs(int level, ArrayList<String> arrayList) 
      throws IOException{
    ArrayList<String> total = null;
    if(level>0){      
      total = new ArrayList<>();
      for(String url: arrayList){
        /* For each URL in arrayList, parse its page content and collect all URLs inside it */
        for(String each: getBareLinks(url)){
          total.add(each);
        }
      }
      /* Use a HashSet to remove duplicate entries from total */
      HashSet<String> hashSet = new HashSet<>(total);
      total = new ArrayList<>(hashSet);
    }
    return total;
  }

  /**
   * Crawl all related URLs starting from startURL
   * @throws IOException
   */
  public void getAll() throws IOException{
    ArrayList<String> newURLs;
    ArrayList<String> currentURLs = new ArrayList<>();
    /* Add startURL to currentURLs and start crawling from this url */
    currentURLs.add(startURL);
    for(int i=digLevel; i>0; i--){
      /*
       * For each level, get all URLs reachable from the previous round's URL set,
       * add the URLs already crawled to the total URL set,
       * and feed newURLs into the next cycle as the new set to crawl
       */
      System.out.println("Dig into level: " + (digLevel-i+1));
      newURLs = getLevelURLs(i, currentURLs);
      for(String each: currentURLs){
        URLs.add(each);
      }
      currentURLs = newURLs;
    }
    for(String each:currentURLs){
      URLs.add(each);
    }
    HashSet<String> hashSet = new HashSet<>(URLs);
    URLs = new ArrayList<>(hashSet);
  }

  /**
   * @param path  the path where the index is saved
   * @throws IOException
   */
  public void storeURLsAndInfo(String path) throws IOException{
    BuildIndex build = new BuildIndex(path);
    /* Fetch the actual page title for every url in URLs */
    for(String each:URLs){
      String text = getLinkText(each);
      if(text!=null){
        build.addField("url", each);
        build.addField("text", text);
        /* Add this entry to the index */
        build.pushIndex();
      }
    }
    build.close();
  }

  /**
   * @param url  the url whose page title is to be fetched
   * @return  the title text
   * @throws IOException
   */
  public String getLinkText(String url) throws IOException{
    Document document = null;
    try {
      /* Connect with Jsoup and set the timeout to 3 seconds */
      document = Jsoup.connect(url).timeout(3000).get();
    } catch (Exception e) {
      System.out.println("[TIMEOUT]Get title of url:"+url);
      return null;
    }
    String title = document.title();
    return title;
  }


  /**
   * @param url  the url whose content is to be parsed
   * @return  a list of all urls contained in that page's content
   * @throws IOException
   */
  public ArrayList<String> getBareLinks(String url) throws IOException{
    ArrayList<String> linksList = new ArrayList<>();
    Document document;

    try {
      document = Jsoup.connect(url).timeout(2000).get();
    } catch (Exception e) {
      return linksList;
    }
    /* Get all <a> tags with an href attribute inside the <body> tag */
    Elements links = document.select("body").select("a[href]");

    for(Element link: links){
      /* Extract the url from each parsed <a> tag and strip the anchor fragment */
      String href = link.attr("abs:href").replaceAll("#.*", "");
      /* Keep only urls containing zju.edu.cn, and strip a trailing '/' */
      if(href.contains("zju.edu.cn")){
        if (href.endsWith("/")){
          href = href.substring(0, href.length()-1);
        }
        linksList.add(href);
      }
    }
    HashSet<String> hashSet = new HashSet<>(linksList);
    ArrayList<String> arrayList = new ArrayList<>(hashSet);

    return arrayList;
  }

  public static void main(String[] args) {
    Scanner in = new Scanner(System.in);
    System.out.println("Enter url:");
    String url = in.nextLine().trim();
    while(!url.startsWith("http://")){
      System.out.println("http:// is needed!");
      System.out.println("Enter url:");
      url = in.nextLine().trim();
    }
    System.out.println("Enter depth to dig more urls[<=3 recommended] : ");
    int depth = in.nextInt();
    Spider spider = new Spider(url, depth);
    System.out.println("Enter path you want to save[default=d:/index-spider]:");
    String path = in.nextLine().trim();
    if(path.length()==0){
      path = "d:/index-spider";
    }
    try {
      System.out.println("Start fetching...");
      spider.getAll();
      System.out.println("Urls got success!");
      spider.storeURLsAndInfo(path);
      System.out.println("Stored success!");
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

 
}

Index builder: BuildIndex.java


package webCrawler.Index;

import java.io.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * @author lannooo
 *
 */
public class BuildIndex {
  private File file;
  private Directory directory;
  private IndexWriter indexWriter;
  private IndexWriterConfig config;
  private Analyzer analyzer;
  private Document document;

  /**
   * @param path  the path where the index is built
   */
  public BuildIndex(String path) {
    try {
      file = new File(path);
      directory = FSDirectory.open(file);
      document = new Document();
      analyzer = new IKAnalyzer();    /* Chinese word segmentation tool class */
      config = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer);
      indexWriter = new IndexWriter(directory, config);      

    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * @param fieldName  the name of the new field to add to the document
   * @param fieldText  the content of the new field
   */
  public void addField(String fieldName, String fieldText){
    try{
      Field field = new TextField(fieldName, fieldText, Field.Store.YES);
      document.add(field);
    }catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Add the document to the index
   */
  public void pushIndex(){
    try {
      indexWriter.addDocument(document);
      document = new Document();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Add one complete document and save it to the index
   * @param url  the url address to add
   * @param text  the text corresponding to the url
   */
  public void addOneIndex(String url, String text){
    this.addField("url", url);
    this.addField("text", text);
    this.pushIndex();
  }

  /**
   * Close the index writer
   */
  public void close(){
    try {
      indexWriter.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

}
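
A short usage sketch for BuildIndex (not part of the original sources; the URLs and titles are placeholders): addOneIndex adds a url/text pair as one document and pushes it, and close() commits the index.

package webCrawler.Index;

public class BuildIndexDemo {
  public static void main(String[] args) {
    BuildIndex build = new BuildIndex("d:/index-spider");
    /* Each call stores one document with a "url" field and a "text" field */
    build.addOneIndex("http://www.zju.edu.cn", "浙江大学");
    build.addOneIndex("http://www.cs.zju.edu.cn", "浙江大学计算机学院");
    build.close();  // commit and close the IndexWriter
  }
}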

Search index: SearchIndex.java


package webCrawler.Index;

import java.io.File;
import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * @author lannooo
 *
 */
public class SearchIndex {
  private IndexSearcher indexSearcher;
  private Analyzer analyzer;
  private QueryParser parser;
  private Query query;
  private TopDocs hits;
  private DirectoryReader reader;

  /**
   * @param path  the path of the index to search
   */
  public SearchIndex(String path){
    try {
      reader = DirectoryReader.open(FSDirectory.open(new File(path)));
      indexSearcher = new IndexSearcher(reader);
      analyzer = new IKAnalyzer();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * @param fieldName  the name of the field to search
   * @param text  the text to search for
   * @param matchNumber  the maximum number of matches to return
   * @return  the total number of hits, or -1 on error
   */
  public int search(String fieldName, String text, int matchNumber){
    try {
      parser = new QueryParser(fieldName, analyzer);
      query = parser.parse(text);
      hits = indexSearcher.search(query, matchNumber);

      return hits.totalHits;
    } catch (Exception e) {
      e.printStackTrace();
    }
    return -1;
  }
  /**
   *  Print all matches 
   */
  public void printHits(){
    try{
      System.out.println("Total hits number:"+hits.totalHits);
      for(ScoreDoc doc: hits.scoreDocs){
        Document document = indexSearcher.doc(doc.doc);
        System.out.println(document.get("url"));
        System.out.println(document.get("text"));
      }
      reader.close();
    }catch (Exception e) {
      e.printStackTrace();
    }
  }
  public static void main(String[] args) {
    /* Read the index path, max hit number and keywords from the user */
    Scanner in = new Scanner(System.in);
    System.out.println("Enter path of the index:");
    String path = in.nextLine().trim();
    while(path.length()==0){
      System.out.println("Enter path of the index:");
      path = in.nextLine().trim();
    }

    System.out.println("Enter max hit number:");
    int max = in.nextInt();
    while(max<0){
      System.out.println("Enter max hit number:");
      max = in.nextInt();
    }
    in.nextLine();
    System.out.print("Search>>> ");
    String text = in.nextLine().trim();
    /* Loop reading the user's keywords: "q" exits, empty input is skipped */
    while(!text.equals("q")){
      if(text.length()>0){
        SearchIndex search = new SearchIndex(path);
        int hits = search.search("text", text, max);
        if(hits!=-1){
          search.printHits();
        }
      }
      System.out.print("Search>>> ");
      text = in.nextLine().trim();
    }
  }
}

UI (command-line only for convenience; a GUI can be written as needed): UI.java


package webCrawler.UI;

import java.util.Scanner;

import webCrawler.Index.SearchIndex;

/**
 * @author lannooo
 *
 */
public class UI {
  public static void main(String[] args) {
    /* Read the user's keywords */
    Scanner in = new Scanner(System.in);
    System.out.print("Search>>> ");
    String text = in.nextLine().trim();
    /* Loop reading the user's keywords: "q" or empty input exits */
    while(!text.equals("q") && text.length()>0){
      SearchIndex search = new SearchIndex("d:/index-spider2");
      int hits = search.search("text", text, 20);
      if(hits!=-1){
        search.printHits();
      }
      System.out.print("Search>>> ");
      text = in.nextLine().trim();
    }
  }
}

