Parsing HTML in Java: a web spider example

2020-04-01 03:06:45
OfStack

Complex, tedious HTML pages are daunting to work with, because it is hard to extract data from them.

The first and oldest method is to use regular expressions.

The second method uses the open-source htmlparser package. It is an older project, and in my estimation the results are not very good: it does not seem able to analyze HTML in depth and appears to handle only about five levels of structure.

Here is some source code based on htmlparser that can get all the hyperlinks:


   
package test;
import java.util.HashMap;
import java.util.Map;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

public class GetLinkTest {
    public static void main(String[] args) {
        try {
            //Filter out through a filter. A> The label
            Parser parser = new Parser("//www.jb51.net");
            NodeList nodeList = parser.extractAllNodesThatMatch(new NodeFilter() {
                //Implement this method to filter labels
                public boolean accept(Node node) {
                    if (node instanceof LinkTag)//tag
                    {
                        return true;
                    }
                    return false;
                }
            });
            //print
            for (int i = 0; i < nodeList.size(); i++) {
                LinkTag n = (LinkTag) nodeList.elementAt(i);
                //System.out.print(n.getStringText() + " ==>> ");
                //System.out.println(n.extractLink());
                try {
                    if (n.extractLink().equals("//www.jb51.net")) {
                        System.out.println(n.extractLink());
                    }
                } catch (Exception e) {
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

The third method, which is the one I use now, is to clean the HTML up into XML and then parse the XML in Java to get the data. Here is Java source code that cleans HTML:



package exec;

import java.io.File;
import java.io.IOException;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.PrettyXmlSerializer;
import org.htmlcleaner.TagNode;


public class HtmlClean {

    public void cleanHtml(String htmlurl, String xmlurl) {
        try {
            long start = System.currentTimeMillis();

            HtmlCleaner cleaner = new HtmlCleaner();
            CleanerProperties props = cleaner.getProperties();
            props.setUseCdataForScriptAndStyle(true);
            props.setRecognizeUnicodeChars(true);
            props.setUseEmptyElementTags(true);
            props.setAdvancedXmlEscape(true);
            props.setTranslateSpecialEntities(true);
            props.setBooleanAttributeValues("empty");

            TagNode node = cleaner.clean(new File(htmlurl));

            System.out.println("vreme:" + (System.currentTimeMillis() - start));

            new PrettyXmlSerializer(props).writeXmlToFile(node, xmlurl);

            System.out.println("vreme:" + (System.currentTimeMillis() - start));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}