Crawling and analyzing web page data with C#
- 2020-05-05 11:47:22
- OfStack
To meet the above requirements, we need to simulate a browser visiting the web page, fetch the page's data, analyze it, and finally write the result of the analysis (that is, the sorted data) to the database. So the idea is:
1. Send the HttpWebRequest.
2. Receive the result returned as an HttpWebResponse and read the HTML source of the target page from it.
3. Extract the part of the source that contains the data.
4. Build an HtmlDocument from that HTML fragment and loop over it to read out the data.
5. Write the data to the database.
The procedure is as follows:
1. Send the HttpWebRequest.
2. Receive the result returned as an HttpWebResponse and read the HTML source of the target page from it.
3. Extract the part of the source that contains the data.
4. Build an HtmlDocument from that HTML fragment and loop over it to read out the data.
5. Write the data to the database.
The procedure is as follows:
/// <summary>
/// Downloads the page at <paramref name="Url"/> and returns its HTML source,
/// decoding the response body as GB2312.
/// </summary>
/// <param name="Url">Address of the page to download.</param>
/// <returns>
/// The page source, or an empty string when anything in the download fails
/// (in which case an error message box is shown).
/// </returns>
private string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        // Connection timeout, in milliseconds.
        request.Timeout = 30000;
        request.Headers.Set("Pragma", "no-cache");

        // Resolve the encoding before opening the response so that a failure
        // here cannot leave an open connection behind.
        Encoding encoding = Encoding.GetEncoding("GB2312");

        // Dispose the response, its stream and the reader deterministically;
        // an unclosed HttpWebResponse keeps its connection checked out.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, encoding))
        {
            strResult = streamReader.ReadToEnd();
        }
    }
    catch
    {
        // Deliberately best-effort: any failure is reported to the user and
        // the caller receives an empty string.
        MessageBox.Show(" error ");
    }
    return strResult;
}
To use HttpWebRequest and HttpWebResponse, add the following namespace reference (Stream and StreamReader additionally require System.IO, and Encoding requires System.Text):
using System.Net;
The following is the specific implementation process of the program:
/// <summary>
/// Click handler: downloads the chart page, cuts out the first table that follows
/// the " song TOP500" marker, parses each table row into (id, name, singer)
/// entries via <c>AddLine</c>, then stores the result with <c>InsertData</c> and
/// binds it to the grid.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Address of the page to crawl
    string Url = "http://list.mp3.baidu.com/topso/mp3topsong.html?id=1#top2";
    // Fetch the page source and show it unprocessed
    string strWebContent = GetWebContent(Url);
    richTextBox1.Text = strWebContent;

    if (string.IsNullOrEmpty(strWebContent))
    {
        // GetWebContent has already reported the failure; nothing to parse.
        return;
    }

    // Locate the table that holds the data. Each search starts where the
    // previous one matched, so a miss (-1) must stop the chain: passing -1 as
    // a start index to IndexOf/Substring throws ArgumentOutOfRangeException.
    const string tableCloseTag = "</table>";
    int iBodyStart = strWebContent.IndexOf("<body", 0, StringComparison.Ordinal);
    int iStart = iBodyStart < 0
        ? -1
        : strWebContent.IndexOf(" song TOP500", iBodyStart, StringComparison.Ordinal);
    int iTableStart = iStart < 0
        ? -1
        : strWebContent.IndexOf("<table", iStart, StringComparison.Ordinal);
    int iTableEnd = iTableStart < 0
        ? -1
        : strWebContent.IndexOf(tableCloseTag, iTableStart, StringComparison.Ordinal);
    if (iTableEnd < 0)
    {
        MessageBox.Show("The song table was not found in the downloaded page.");
        return;
    }
    string strWeb = strWebContent.Substring(iTableStart, iTableEnd - iTableStart + tableCloseTag.Length);

    // Each row is laid out as up to three (id cell, name/singer cell) pairs.
    const int columnPairsPerRow = 3;

    // Build an HtmlDocument from the fragment using an off-screen browser
    // control; the control is disposed once the rows have been read.
    using (WebBrowser webb = new WebBrowser())
    {
        webb.Navigate("about:blank");
        HtmlDocument htmldoc = webb.Document.OpenNew(true);
        htmldoc.Write(strWeb);
        HtmlElementCollection htmlTR = htmldoc.GetElementsByTagName("TR");
        foreach (HtmlElement tr in htmlTR)
        {
            HtmlElementCollection cells = tr.GetElementsByTagName("TD");
            for (int pair = 0; pair < columnPairsPerRow; pair++)
            {
                int idIndex = pair * 2;
                int nameIndex = idIndex + 1;
                if (nameIndex >= cells.Count)
                {
                    // Header rows or short trailing rows: no further pairs.
                    break;
                }

                string strID = cells[idIndex].InnerText;
                if (strID == null)
                {
                    // Empty id cell: no entry in this slot.
                    continue;
                }

                string songText = cells[nameIndex].InnerText;
                string strName = SplitName(songText, "MusicName");
                string strSinger = SplitName(songText, "Singer");
                // The id cell text carries a '.', which is stripped before storing.
                AddLine(strID.Replace(".", ""), strName, strSinger, "0");
            }
        }
    }

    // Persist the collected rows and show them in the grid
    InsertData(dt);
    dataGridView1.DataSource = dt.DefaultView;
}