Java example: parsing HTML with regular expressions

  • 2020-04-01 02:59:11
  • OfStack

package work;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
public class chuanboyi {
 public static void main(String[] args){
  // TODO Auto-generated method stub
  StringBuffer html = new StringBuffer();
  HttpClient httpclient = new HttpClient();
  //Create an instance of the GET method
  GetMethod getMethod = new GetMethod("//");
  //Use the default recovery policy provided by the system
  getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
   //Execute the GET method
   int statusCode = httpclient.executeMethod(getMethod);
   if(statusCode != HttpStatus.SC_OK){
    System.out.println("Method is wrong " + getMethod.getStatusLine());
   InputStream responseBody = getMethod.getResponseBodyAsStream();
   BufferedReader reader = new BufferedReader(new InputStreamReader(responseBody,"utf-8"));
   String line = reader.readLine();
   while(line != null){
    line = reader.readLine();
   //Regular expression
   String regex = "<form name="compareForm"[\s\S]+>[\s\S]+</form>.*<script.*>";
   String regexa ="(?<=<li>)[\s\S]+?(?=</li>)";
   Pattern pattern = Pattern.compile(regex);
         Matcher m = pattern.matcher(html);
         StringBuffer str = new StringBuffer(); 
         int i = 0; 
         pattern = Pattern.compile(regexa);
         m = pattern.matcher(str);
         System.out.println(" A total of "+i+" The data! ");
  }catch (HttpException e) {
   // TODO: handle exception
   System.out.println("Please check your provided http address!");
  }catch (IOException e) {
   // TODO: handle exception
   System.out.println("the line is wrong!");
   getMethod.releaseConnection();//Release the link
 public static void attrs(String str){

  // To obtain url the Regular expression
  String regexURL = "[a-z]+-[0-9]+\.html";
  // To obtain Name the Regular expression
  String regexName = "(?<=title=")[[\w-\s][^x00-xff]]+(?=")";
  // Get the picture Regular expression
  String regexPicture = "images.*\.jpg";

  Pattern patternURL = Pattern.compile(regexURL);
  Pattern patternName = Pattern.compile(regexName);
  Pattern patternPicture = Pattern.compile(regexPicture);
  Matcher mURL = patternURL.matcher(str);
  Matcher mName = patternName.matcher(str);
  Matcher mPicture = patternPicture.matcher(str);
   System.out.println(" The name :";
   System.out.println(" link :";
   System.out.println(" The picture :";

Related articles: