Using the Nagao algorithm in Java for new-word discovery and hot-word mining

  • 2020-04-01 04:05:35
  • OfStack

The Nagao algorithm is used to count the frequency of every substring; from these counts, the term frequency, the number of left and right neighbors, the left and right neighbor entropy, and the mutual information (internal cohesion) of each string are then calculated.

Terminology:

  Nagao algorithm: a fast algorithm for counting the frequency of all substrings in a text. A detailed description of the algorithm is available at http://www.doc88.com/p-664123446503.html
  Term frequency: the number of times the string appears in the document. The more often it appears, the more important it is.
  Left and right neighbors: the number of distinct characters that appear immediately to the left and to the right of the string in the document. The more left and right neighbors a string has, the more likely it is to be a word.
  Left/right entropy: the entropy of the distribution of the characters that appear immediately to the left and to the right of the string in the document. It is similar to the neighbor count above, but it also takes into account how evenly the neighbors are distributed.
  Mutual information: split the string at every possible position into a left part and a right part, divide the probability of the whole string occurring by the product of the probabilities of the two parts occurring independently, and take the minimum of this ratio over all split positions. The larger the value, the higher the internal cohesion of the string and the more likely it is to be a word.

Specific flow of the algorithm:

1.   Read the input file line by line and split each line on runs of non-Chinese characters ([^\u4E00-\u9FA5]+) and on the stop characters listed in the `stopwords` constant.
The code is as follows:
String[] phrases = line.split("[^\u4E00-\u9FA5]+|[" + stopwords + "]");
Stop words can be modified.
2.   Take every suffix of each phrase and every suffix of the reversed phrase, and add them to the right PTable and the left PTable, respectively.
3.   Sort each PTable and calculate its LTable. LTable[i] records the length of the common prefix shared by entry i of the sorted PTable and the entry before it.
4.   By traversing the PTable and the LTable, the term frequency and the left and right neighbors of every substring can be obtained.
5.   From the term frequencies and the left and right neighbors of all substrings, output the term frequency, the number of left and right neighbors, the left and right entropy, and the mutual information of each string.

1.   NagaoAlgorithm.java


package com.algo.word;
 
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
 
/**
 * Substring statistics for Chinese new-word discovery, based on sorted suffix tables.
 *
 * <p>Each input line is split into phrases of consecutive Chinese characters. Every suffix of a
 * phrase goes into the "right" PTable and every suffix of the reversed phrase goes into the "left"
 * PTable. After sorting, the LTable of each PTable stores the length of the common prefix of each
 * entry with the entry before it, which lets the term frequency (TF) and the neighbor characters of
 * every substring of up to N characters be counted in one pass over each table.
 *
 * <p>The result file has one line per accepted word with seven comma-separated columns:
 * word, TF, left neighbor number, right neighbor number, left neighbor entropy,
 * right neighbor entropy, mutual information.
 */
public class NagaoAlgorithm {

  /** Thresholds "TF,left neighbors,right neighbors,MI" used by the three-argument entry point. */
  private static final String DEFAULT_FILTER = "20,3,3,5";

  // NOTE(review): this list looks machine-translated from an original set of Chinese stop
  // characters. As plain ASCII it never matches anything that the non-Chinese alternative of the
  // splitter does not already split on - confirm and restore the intended characters.
  private final static String stopwords = " Of very yao it is well all also ratio return this at not with just top use is good and rightness straight after did not say ";

  /**
   * Splits a line on runs of characters outside U+4E00..U+9FA5 and on single stop characters.
   * Compiled once because it is applied to every input line.
   */
  private static final Pattern PHRASE_SPLITTER =
      Pattern.compile("[^\\u4E00-\\u9FA5]+|[" + stopwords + "]");

  /** Maximum length, in characters, of the substrings that are counted (the "N" of the n-gram). */
  private int maxWordLength;

  /** Suffixes of the reversed phrases; sorted by {@link #countLTable()}. */
  private final List<String> leftPTable = new ArrayList<String>();
  /** Common-prefix length of each left PTable entry with its predecessor. */
  private int[] leftLTable;
  /** Suffixes of the phrases; sorted by {@link #countLTable()}. */
  private final List<String> rightPTable = new ArrayList<String>();
  /** Common-prefix length of each right PTable entry with its predecessor. */
  private int[] rightLTable;
  /** Total number of characters over all phrases; the denominator of every probability. */
  private double wordNumber;

  /** Statistics of every substring of at most N characters, keyed by the substring. */
  private final Map<String, TFNeighbor> wordTFNeighbor = new HashMap<String, TFNeighbor>();

  private NagaoAlgorithm() {
    // default N = 5
    maxWordLength = 5;
  }

  /** Returns {@code phrase} with its characters in reverse order. */
  private static String reverse(String phrase) {
    StringBuilder reversePhrase = new StringBuilder(phrase.length());
    for (int i = phrase.length() - 1; i >= 0; i--) {
      reversePhrase.append(phrase.charAt(i));
    }
    return reversePhrase.toString();
  }

  /** Returns the length of the longest common prefix of {@code s1} and {@code s2}. */
  private static int coPrefixLength(String s1, String s2) {
    int limit = Math.min(s1.length(), s2.length());
    int length = 0;
    while (length < limit && s1.charAt(length) == s2.charAt(length)) {
      length++;
    }
    return length;
  }

  /** Splits {@code line} into phrases and adds all their suffixes to the two PTables. */
  private void addToPTable(String line) {
    // split line according to consecutive non-Chinese characters and stop characters
    String[] phrases = PHRASE_SPLITTER.split(line);
    for (String phrase : phrases) {
      for (int i = 0; i < phrase.length(); i++) {
        rightPTable.add(phrase.substring(i));
      }
      String reversePhrase = reverse(phrase);
      for (int i = 0; i < reversePhrase.length(); i++) {
        leftPTable.add(reversePhrase.substring(i));
      }
      wordNumber += phrase.length();
    }
  }

  /**
   * Reads {@code in} line by line and feeds every line to {@link #addToPTable(String)}.
   *
   * @throws RuntimeException wrapping the {@link IOException} if the file cannot be read
   */
  private void addFileToPTable(String in) {
    try {
      BufferedReader br = new BufferedReader(new FileReader(in));
      try {
        String line;
        while ((line = br.readLine()) != null) {
          addToPTable(line);
        }
      } finally {
        // close even when reading fails, so the file handle is not leaked
        br.close();
      }
    } catch (IOException e) {
      throw new RuntimeException("Failed to read input file: " + in, e);
    }
  }

  /** Sorts {@code pTable} in place and returns its LTable (entry 0 is always 0). */
  private static int[] sortAndCountLTable(List<String> pTable) {
    Collections.sort(pTable);
    int[] lTable = new int[pTable.size()];
    for (int i = 1; i < pTable.size(); i++) {
      lTable[i] = coPrefixLength(pTable.get(i - 1), pTable.get(i));
    }
    return lTable;
  }

  /** Sorts both PTables and computes their LTables. */
  private void countLTable() {
    rightLTable = sortAndCountLTable(rightPTable);
    leftLTable = sortAndCountLTable(leftPTable);
    System.out.println("Info: [Nagao Algorithm Step 2]: having sorted PTable and counted left and right LTable");
  }

  /** According to the PTables and LTables, counts TF and the neighbor distribution of every word. */
  private void countTFNeighbor() {
    // get TF and right neighbor
    for (int pIndex = 0; pIndex < rightPTable.size(); pIndex++) {
      String phrase = rightPTable.get(pIndex);
      // Prefixes no longer than rightLTable[pIndex] are shared with the previous entry and were
      // already counted when their first occurrence in the sorted table was visited.
      for (int length = 1 + rightLTable[pIndex]; length <= maxWordLength && length <= phrase.length(); length++) {
        String word = phrase.substring(0, length);
        TFNeighbor tfNeighbor = new TFNeighbor();
        tfNeighbor.incrementTF();
        if (phrase.length() > length) {
          tfNeighbor.addToRightNeighbor(phrase.charAt(length));
        }
        // All following entries that still share at least `length` characters start with `word`.
        for (int lIndex = pIndex + 1; lIndex < rightLTable.length; lIndex++) {
          if (rightLTable[lIndex] < length) {
            break;
          }
          tfNeighbor.incrementTF();
          String coPhrase = rightPTable.get(lIndex);
          if (coPhrase.length() > length) {
            tfNeighbor.addToRightNeighbor(coPhrase.charAt(length));
          }
        }
        wordTFNeighbor.put(word, tfNeighbor);
      }
    }
    // get left neighbor
    for (int pIndex = 0; pIndex < leftPTable.size(); pIndex++) {
      String phrase = leftPTable.get(pIndex);
      for (int length = 1 + leftLTable[pIndex]; length <= maxWordLength && length <= phrase.length(); length++) {
        // The left table holds reversed text, so reverse the prefix to get the real word; the
        // character following the prefix is the word's left neighbor in the original text.
        String word = reverse(phrase.substring(0, length));
        TFNeighbor tfNeighbor = wordTFNeighbor.get(word);
        if (phrase.length() > length) {
          tfNeighbor.addToLeftNeighbor(phrase.charAt(length));
        }
        for (int lIndex = pIndex + 1; lIndex < leftLTable.length; lIndex++) {
          if (leftLTable[lIndex] < length) {
            break;
          }
          String coPhrase = leftPTable.get(lIndex);
          if (coPhrase.length() > length) {
            tfNeighbor.addToLeftNeighbor(coPhrase.charAt(length));
          }
        }
      }
    }
    System.out.println("Info: [Nagao Algorithm Step 3]: having counted TF and Neighbor");
  }

  /**
   * Returns the mutual information of {@code word}: the minimum, over every split of the word into
   * a left and a right part, of P(word) / (P(left) * P(right)). Single characters yield 0.
   */
  private double countMI(String word) {
    if (word.length() <= 1) {
      return 0;
    }
    double coProbability = wordTFNeighbor.get(word).getTF() / wordNumber;
    List<Double> mi = new ArrayList<Double>(word.length());
    for (int pos = 1; pos < word.length(); pos++) {
      double leftProbability = wordTFNeighbor.get(word.substring(0, pos)).getTF() / wordNumber;
      double rightProbability = wordTFNeighbor.get(word.substring(pos)).getTF() / wordNumber;
      mi.add(coProbability / (leftProbability * rightProbability));
    }
    return Collections.min(mi);
  }

  /**
   * Parses the four threshold strings (TF, left neighbors, right neighbors, MI) as integers.
   * Surrounding whitespace is ignored.
   *
   * @throws NumberFormatException if one of the values is not an integer
   */
  private static int[] parseThreshold(String[] threshold) {
    int[] parsed = new int[threshold.length];
    for (int i = 0; i < threshold.length; i++) {
      parsed[i] = Integer.parseInt(threshold[i].trim());
    }
    return parsed;
  }

  /** Reads the stop-word file; lines of fewer than two characters are ignored. */
  private static Set<String> readStopWords(String stopList) throws IOException {
    Set<String> stopWords = new HashSet<String>();
    BufferedReader br = new BufferedReader(new FileReader(stopList));
    try {
      String line;
      while ((line = br.readLine()) != null) {
        if (line.length() > 1) {
          stopWords.add(line);
        }
      }
    } finally {
      br.close();
    }
    return stopWords;
  }

  /**
   * Saves TF, (left and right) neighbor number, neighbor entropy and mutual information of every
   * word that is longer than one character, is not a stop word and exceeds all four thresholds.
   *
   * @param out       path of the result file (overwritten)
   * @param stopList  path of the stop-word file, one word per line
   * @param threshold exclusive lower bounds: TF, left neighbors, right neighbors, MI
   * @throws RuntimeException wrapping the {@link IOException} if reading or writing fails
   */
  private void saveTFNeighborInfoMI(String out, String stopList, int[] threshold) {
    try {
      // read stop words file
      Set<String> stopWords = readStopWords(stopList);
      // output words TF, neighbor info, MI
      BufferedWriter bw = new BufferedWriter(new FileWriter(out));
      try {
        for (Map.Entry<String, TFNeighbor> entry : wordTFNeighbor.entrySet()) {
          String word = entry.getKey();
          if (word.length() <= 1 || stopWords.contains(word)) {
            continue;
          }
          TFNeighbor tfNeighbor = entry.getValue();
          int tf = tfNeighbor.getTF();
          int leftNeighborNumber = tfNeighbor.getLeftNeighborNumber();
          int rightNeighborNumber = tfNeighbor.getRightNeighborNumber();
          double mi = countMI(word);
          if (tf > threshold[0] && leftNeighborNumber > threshold[1]
              && rightNeighborNumber > threshold[2] && mi > threshold[3]) {
            StringBuilder sb = new StringBuilder();
            sb.append(word);
            sb.append(",").append(tf);
            sb.append(",").append(leftNeighborNumber);
            sb.append(",").append(rightNeighborNumber);
            sb.append(",").append(tfNeighbor.getLeftNeighborEntropy());
            sb.append(",").append(tfNeighbor.getRightNeighborEntropy());
            // one record per line
            sb.append(",").append(mi).append("\n");
            bw.write(sb.toString());
          }
        }
      } finally {
        // close (and flush) even when writing fails
        bw.close();
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    System.out.println("Info: [Nagao Algorithm Step 4]: having saved to file");
  }

  /** Runs the four steps of the algorithm on this instance. */
  private void process(String[] inputs, String out, String stopList, String[] threshold) {
    // validate the thresholds before doing any expensive work
    int[] parsedThreshold = parseThreshold(threshold);
    // step 1: add phrases to PTable
    for (String in : inputs) {
      addFileToPTable(in);
    }
    System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable");
    // step 2: sort PTable and count LTable
    countLTable();
    // step 3: count TF and Neighbor
    countTFNeighbor();
    // step 4: save TF NeighborInfo and MI
    saveTFNeighborInfoMI(out, stopList, parsedThreshold);
  }

  /**
   * Applies the Nagao algorithm to the input files with N = 5 and thresholds "20,3,3,5".
   *
   * @param inputs   paths of the input text files
   * @param out      path of the result file
   * @param stopList path of the stop-word file
   * @throws RuntimeException if a file cannot be read or written
   */
  public static void applyNagao(String[] inputs, String out, String stopList) {
    new NagaoAlgorithm().process(inputs, out, stopList, DEFAULT_FILTER.split(","));
  }

  /**
   * Applies the Nagao algorithm to the input files with an explicit N and output filter.
   *
   * @param inputs   paths of the input text files
   * @param out      path of the result file
   * @param stopList path of the stop-word file
   * @param n        maximum length of the substrings to count
   * @param filter   four comma-separated integers: exclusive lower bounds for TF, left neighbor
   *                 number, right neighbor number and MI; if it does not have exactly four
   *                 parts an error is printed and nothing is processed
   * @throws NumberFormatException if a part of {@code filter} is not an integer
   * @throws RuntimeException if a file cannot be read or written
   */
  public static void applyNagao(String[] inputs, String out, String stopList, int n, String filter) {
    String[] threshold = filter.split(",");
    if (threshold.length != 4) {
      System.out.println("ERROR: filter must have 4 numbers, seperated with ',' ");
      return;
    }
    NagaoAlgorithm nagao = new NagaoAlgorithm();
    nagao.setN(n);
    nagao.process(inputs, out, stopList, threshold);
  }

  private void setN(int n) {
    maxWordLength = n;
  }

  public static void main(String[] args) {
    String[] ins = {"E://test//ganfen.txt"};
    applyNagao(ins, "E://test//out.txt", "E://test//stoplist.txt");
  }

}

2. TFNeighbor.java


package com.algo.word;
 
import java.util.HashMap;
import java.util.Map;
 
/**
 * Statistics collected for one candidate word: its term frequency and how often each character
 * was seen immediately to its left and to its right.
 */
public class TFNeighbor {

  /** Term frequency: number of times {@link #incrementTF()} has been called. */
  private int tf;
  /** Occurrence count of each character added as a left neighbor. */
  private final Map<Character, Integer> leftNeighbor = new HashMap<Character, Integer>();
  /** Occurrence count of each character added as a right neighbor. */
  private final Map<Character, Integer> rightNeighbor = new HashMap<Character, Integer>();

  TFNeighbor() {
  }

  /** Records one more occurrence of {@code word} as a left neighbor. */
  public void addToLeftNeighbor(char word) {
    increment(leftNeighbor, word);
  }

  /** Records one more occurrence of {@code word} as a right neighbor. */
  public void addToRightNeighbor(char word) {
    increment(rightNeighbor, word);
  }

  /** Increments the term frequency by one. */
  public void incrementTF() {
    tf++;
  }

  /** Returns the number of distinct left neighbor characters. */
  public int getLeftNeighborNumber() {
    return leftNeighbor.size();
  }

  /** Returns the number of distinct right neighbor characters. */
  public int getRightNeighborNumber() {
    return rightNeighbor.size();
  }

  /** Returns the entropy (natural logarithm) of the left neighbor distribution, 0 if empty. */
  public double getLeftNeighborEntropy() {
    return entropy(leftNeighbor);
  }

  /** Returns the entropy (natural logarithm) of the right neighbor distribution, 0 if empty. */
  public double getRightNeighborEntropy() {
    return entropy(rightNeighbor);
  }

  /** Returns the term frequency. */
  public int getTF() {
    return tf;
  }

  /** Adds one to the count stored for {@code word}, starting at 1 for an unseen character. */
  private static void increment(Map<Character, Integer> neighbors, char word) {
    Integer number = neighbors.get(word);
    neighbors.put(word, number == null ? 1 : 1 + number);
  }

  /**
   * Computes the entropy of the count distribution in {@code neighbors}.
   *
   * <p>With counts n_i and total S, the value returned is ln(S) - (sum of n_i * ln(n_i)) / S,
   * which equals -sum(p_i * ln(p_i)) for p_i = n_i / S. An empty map yields 0.
   */
  private static double entropy(Map<Character, Integer> neighbors) {
    double weightedLogSum = 0;
    int sum = 0;
    for (int number : neighbors.values()) {
      weightedLogSum += number * Math.log(number);
      sum += number;
    }
    if (sum == 0) {
      return 0;
    }
    return Math.log(sum) - weightedLogSum / sum;
  }
}

3. Main.java


package com.algo.word;
 
/**
 * Command-line entry point for the Nagao new-word extraction.
 */
public class Main {

  /**
   * Runs the algorithm with either 3 or 5 arguments.
   *
   * <p>With 3 arguments: the first is the list of input files separated by ',', the second is the
   * output file and the third is the stop-word list. The output has 7 columns separated by ',':
   * word, term frequency, left neighbor number, right neighbor number, left neighbor entropy,
   * right neighbor entropy, mutual information.
   *
   * <p>With 5 arguments: the fourth is the n-gram parameter N and the fifth is the threshold of
   * the output words, e.g. "20,3,3,5" outputs words with TF &gt; 20, left and right neighbor
   * number &gt; 3 and MI &gt; 5.
   *
   * <p>Any other number of arguments prints a usage message and does nothing.
   *
   * @throws NumberFormatException if the fourth argument is not an integer
   */
  public static void main(String[] args) {
    if (args.length == 3) {
      NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2]);
    } else if (args.length == 5) {
      NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2], Integer.parseInt(args[3]), args[4]);
    } else {
      // Previously an unexpected argument count exited silently; tell the user what is expected.
      System.err.println("Usage: Main <input1,input2,...> <output> <stoplist> [<N> <tf,left,right,mi>]");
    }
  }

}

That is all for this article; I hope you find it useful.


Related articles: