C#: crawling a web page and analyzing its data

  • 2020-05-05 11:47:22
  • OfStack

To meet these requirements, we need to simulate a browser visiting the web page, fetch the page's data, analyze it, and finally write the result of the analysis (that is, the sorted data) to the database. So the idea is:
1. Send the HttpRequest request.
2. Receive the result returned by HttpResponse, i.e. get the HTML source of the specific page.
3. Extract the part of the source code that contains the data.
4. Generate an HtmlDocument from the HTML source and loop over it to read out the data.
5. Write to the database.
The program is as follows:

/// <summary>
/// Downloads the page at the given URL and returns its HTML source.
/// </summary>
/// <param name="Url">Address of the page to download.</param>
/// <returns>
/// The response body decoded as GB2312, or an empty string when the request
/// or the read fails (a message box is shown in that case).
/// </returns>
private string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        // Time-out for GetResponse, in milliseconds.
        request.Timeout = 30000;
        request.Headers.Set("Pragma", "no-cache");

        // Dispose the response, its stream and the reader even when reading
        // throws; otherwise the underlying connection is never released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
        {
            strResult = streamReader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Keep the original best-effort contract (message box + empty result),
        // but leave a trace of what actually went wrong.
        System.Diagnostics.Debug.WriteLine(ex);
        MessageBox.Show(" error ");
    }
    return strResult;
}
To use HttpWebRequest and HttpWebResponse, add the following namespace reference:
  using System.Net; 
The following is the concrete implementation of the program:
/// <summary>
/// Downloads the chart page, cuts out the table that follows the
/// " song TOP500" marker, adds one line per (ID, name, singer) entry via
/// AddLine, then passes the data table to InsertData and binds it to the grid.
/// </summary>
/// <param name="sender">Control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // URL of the page to scrape
    string Url = "http://list.mp3.baidu.com/topso/mp3topsong.html?id=1#top2";

    // Fetch the page source and show it unmodified
    string strWebContent = GetWebContent(Url);
    richTextBox1.Text = strWebContent;

    // Cut out the piece of the source that holds the data
    string strWeb = ExtractSongTable(strWebContent);
    if (strWeb == null)
    {
        // An empty page means GetWebContent already reported the failure;
        // otherwise the expected markers are missing from the page.
        if (!string.IsNullOrEmpty(strWebContent))
        {
            MessageBox.Show("Could not find the song table in the downloaded page.");
        }
        return;
    }

    // Load the fragment into an HtmlDocument so the rows can be walked as elements.
    using (WebBrowser webb = new WebBrowser())
    {
        webb.Navigate("about:blank");
        HtmlDocument htmldoc = webb.Document.OpenNew(true);
        htmldoc.Write(strWeb);

        foreach (HtmlElement tr in htmldoc.GetElementsByTagName("TR"))
        {
            // Look the cells up once per row instead of once per access.
            HtmlElementCollection cells = tr.GetElementsByTagName("TD");

            // A row carries up to three entries, each as an (ID cell, name/singer cell)
            // pair at indexes 0-1, 2-3 and 4-5; incomplete pairs are skipped.
            for (int i = 0; i < 6 && i + 1 < cells.Count; i += 2)
            {
                string strID = cells[i].InnerText;
                if (strID == null)
                {
                    continue;
                }

                string strInfo = cells[i + 1].InnerText;
                string strName = SplitName(strInfo, "MusicName");
                string strSinger = SplitName(strInfo, "Singer");

                // Insert into the DataTable; the ID is stored without dots.
                AddLine(strID.Replace(".", ""), strName, strSinger, "0");
            }
        }
    }

    // Insert into the database
    InsertData(dt);

    dataGridView1.DataSource = dt.DefaultView;
}

/// <summary>
/// Returns the first table element (opening tag through closing tag) that
/// follows the " song TOP500" marker after the start of the body element.
/// </summary>
/// <param name="html">Full HTML source of the page; may be null or empty.</param>
/// <returns>The table markup, or null when any of the expected markers is missing.</returns>
private static string ExtractSongTable(string html)
{
    const string tableClose = "</table>";

    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    int iBodyStart = html.IndexOf("<body", StringComparison.Ordinal);
    if (iBodyStart < 0)
    {
        return null;
    }

    int iStart = html.IndexOf(" song TOP500", iBodyStart, StringComparison.Ordinal);
    if (iStart < 0)
    {
        return null;
    }

    int iTableStart = html.IndexOf("<table", iStart, StringComparison.Ordinal);
    if (iTableStart < 0)
    {
        return null;
    }

    int iTableEnd = html.IndexOf(tableClose, iTableStart, StringComparison.Ordinal);
    if (iTableEnd < 0)
    {
        return null;
    }

    // Include the closing tag itself in the returned fragment.
    return html.Substring(iTableStart, iTableEnd - iTableStart + tableClose.Length);
}

Related articles: