Example of a Java regular expression that matches all URLs and link text on a web page

2020-04-01 03:06:52
OfStack

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.net.*;
import java.io.*;
import java.util.regex.*;

/**
 * Downloads a web page, cuts out the region between two marker strings, and prints every
 * anchor ({@code <a ...>...</a>}) found in that region together with its link text and
 * its {@code href} value.
 *
 * <p>Call order used by {@link #main}: construct with the two area markers, set the start
 * URL with {@link #getStartUrl}, download with {@link #getUrlContent}, cut the area with
 * {@link #getContentArea}, then print the links with {@link #Urls()}.
 *
 * <p>Several methods are named {@code get...} although they assign a field and return
 * nothing; the names are kept so existing callers continue to compile.
 */
public class Urls
{
    /** Matches one anchor element: group 1 is the attribute text, group 2 the inner markup. */
    private static final Pattern ANCHOR =
        Pattern.compile("<a\\b([^>]*)>(.*?)</a>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Finds the href attribute; its value is in group 1 ("..."), 2 ('...') or 3 (unquoted). */
    private static final Pattern HREF = Pattern.compile(
        "(?:^|\\s)href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
        Pattern.CASE_INSENSITIVE);

    /** Matches any single markup tag; used to reduce an anchor's inner markup to plain text. */
    private static final Pattern TAG = Pattern.compile("<[^>]*>");

    /** Address of the page to download; assigned by {@link #getStartUrl}. */
    private String startUrl;
    /** Text of the downloaded page; assigned by {@link #getUrlContent}. */
    String  urlContent;
    /** Part of {@link #urlContent} between the two area markers; assigned by {@link #getContentArea}. */
    String ContentArea;
    /** Marker strings that delimit the area to scan for links. */
    private String strAreaBegin ,strAreaEnd ;
    /** Keywords a URL should / should not contain. Stored only; no method here reads them yet. */
    private String stringInUrl,stringNotInUrl;
    /** Not read or written by any method in this class. */
    String strContent;
    /** Not read or written by any method in this class. */
    String[] allUrls;
    /** Returned by {@link #getRegex}; never assigned in this class. */
    private String  regex;

    /** Holder for one url/title pair; not used by any method in this class. */
    UrlAndTitle   urlAndTitle=new UrlAndTitle();


    /**
     * Demo entry point: downloads a fixed page and prints the links found in its body.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        Urls myurl = new Urls("<body", "/body>");
        myurl.getStartUrl("http://www.zuzwn.com/");
        myurl.getUrlContent();
        myurl.getContentArea();
        myurl.getStringNotInUrl("google");
        myurl.Urls();
    }


    /**
     * Creates a collector that scans the text between the two given markers.
     *
     * @param strAreaBegin string that marks the start of the area (the marker itself is excluded)
     * @param strAreaEnd   string that marks the end of the area (excluded as well)
     */
    public Urls (String strAreaBegin,String strAreaEnd)
    {
        this.strAreaBegin = strAreaBegin;
        this.strAreaEnd = strAreaEnd;
    }

    /**
     * Prints every anchor found in {@link #ContentArea}: the raw anchor markup, its link
     * text with nested tags removed, its href value (when it has one) and a blank line;
     * finishes with the number of anchors found.
     *
     * <p>This is an ordinary method that happens to share the class name, not a constructor.
     * A {@code null} content area is treated as empty, so calling this before
     * {@link #getContentArea} reports zero results instead of throwing.
     */
    public void Urls()
    {
        int count = 0;
        String area = (ContentArea == null) ? "" : ContentArea;
        Matcher anchor = ANCHOR.matcher(area);
        while (anchor.find())
        {
            count++;
            System.out.println(anchor.group());

            // Link text: drop nested markup such as <b> or <img> instead of printing fragments of it.
            String title = TAG.matcher(anchor.group(2)).replaceAll("");
            System.out.println(" The title :" + title);

            // Anchors without an href (e.g. named anchors) get no url line.
            String href = hrefOf(anchor.group(1));
            if (href != null)
            {
                System.out.println(" The url :" + href);
            }

            System.out.println();
        }

        System.out.println(" A total of "+count+" Individual consistent results ");
    }

    /** Returns the value of the href attribute inside an anchor's attribute text, or null if absent. */
    private static String hrefOf(String attributes)
    {
        Matcher m = HREF.matcher(attributes);
        if (!m.find())
        {
            return null;
        }
        // Exactly one of the three alternatives (double-quoted, single-quoted, bare) participated.
        for (int g = 1; g <= 3; g++)
        {
            if (m.group(g) != null)
            {
                return m.group(g);
            }
        }
        return null;
    }

    /**
     * Sets the address of the page to download (a setter, despite the name).
     *
     * @param startUrl page address handed to {@link java.net.URL}
     */
    public void getStartUrl(String startUrl)
    {
        this.startUrl = startUrl;
    }

    /**
     * Downloads the page at the start URL into {@link #urlContent}.
     *
     * <p>Lines are appended without their terminators, so the stored text is a single line.
     * The stream is decoded with the platform default charset. On any failure a message and
     * the stack trace are printed and {@link #urlContent} keeps its previous value.
     */
    public void getUrlContent()
    {
        StringBuilder page = new StringBuilder();
        try
        {
            URL myUrl = new URL(startUrl);
            // try-with-resources closes the reader (and the underlying connection stream)
            // even when reading fails part-way through.
            try (BufferedReader br = new BufferedReader(new InputStreamReader(myUrl.openStream())))
            {
                String line;
                while ((line = br.readLine()) != null)
                {
                    page.append(line);
                }
            }
            urlContent = page.toString();
        }
        catch (Exception e)
        {
            // Deliberately broad: this is a best-effort fetch, so any failure is only reported.
            System.out.println(" The url file could not be output ");
            e.printStackTrace();
        }
    }


    /**
     * Cuts the text between the begin and end markers out of {@link #urlContent} and stores
     * it in {@link #ContentArea}.
     *
     * <p>The area becomes the empty string when no page has been downloaded, when a marker
     * is {@code null}, or when either marker cannot be found (the end marker is searched
     * for only after the begin marker).
     */
    public void getContentArea()
    {
        ContentArea = "";
        if (urlContent == null || strAreaBegin == null || strAreaEnd == null)
        {
            return;
        }

        int begin = urlContent.indexOf(strAreaBegin);
        if (begin < 0)
        {
            return;
        }

        int from = begin + strAreaBegin.length();
        int to = urlContent.indexOf(strAreaEnd, from);
        if (to < 0)
        {
            return;
        }

        ContentArea = urlContent.substring(from, to);
    }

    /**
     * Stores a keyword that collected URLs should contain (a setter, despite the name).
     * The value is only stored; no method in this class filters by it yet.
     *
     * @param stringInUrl keyword to require
     */
    public void getStringInUrl(String stringInUrl)
    {
        this.stringInUrl = stringInUrl;
    }

    /**
     * Stores a keyword that collected URLs should not contain (a setter, despite the name).
     * The value is only stored; no method in this class filters by it yet.
     *
     * @param stringNotInUrl keyword to exclude
     */
    public void getStringNotInUrl(String stringNotInUrl)
    {
        this.stringNotInUrl = stringNotInUrl;
    }

    /** Placeholder; currently does nothing. */
    public void getUrl()
    {

    }

    /**
     * Returns the stored acquisition rule.
     *
     * @return the {@code regex} field, which is {@code null} because nothing in this class assigns it
     */
    public String getRegex()
    {
        return regex;
    }

    /** Simple holder for one link: its address and its text. */
    class UrlAndTitle
    {
        String myURL;
        String title;
    }
}