PHP: scraping articles with cURL and regular expressions (sample code)

2020-03-31 19:58:50
OfStack
 
/**
 * Download the body of a URL with cURL and return it as a trimmed string.
 *
 * Redirects are followed. The connection phase is limited to 10 seconds and
 * the whole transfer to 30 seconds, so an unresponsive server cannot hang the
 * script indefinitely.
 *
 * @param string $url Address to fetch.
 *
 * @return string Response body with surrounding whitespace removed, or an
 *                empty string when the handle cannot be created or the
 *                transfer fails.
 */
function getwebcontent($url){
    $ch = curl_init();
    if ($ch === false) {
        return '';
    }

    $connectTimeout = 10;
    $totalTimeout = 30;
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
    // CONNECTTIMEOUT only covers the connection phase; cap the transfer too.
    curl_setopt($ch, CURLOPT_TIMEOUT, $totalTimeout);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

    $response = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; report that as an empty body
    // explicitly instead of relying on trim(false) === ''.
    if ($response === false) {
        return '';
    }

    return trim($response);
}


// Collected data, kept as parallel lists indexed by article position.
$article = ['title' => [], 'link' => [], 'content' => []];

// Fetch the listing page and extract each article's title and URL.
$string = getwebcontent('http://www.***.com/learn/zhunbeihuaiyun/jijibeiyun/2');

// '#' is used as the delimiter so the slashes in the markup need no escaping.
// [^"]* and .*? keep each match inside a single <a> element even when several
// links share one line.
$listPattern = '#<li><a href="/learn/article/([^"]*)">(.*?)</a>#';
if (preg_match_all($listPattern, $string, $out, PREG_SET_ORDER) > 0) {
    foreach ($out as $match) {
        $article['title'][] = $match[2];
        $article['link'][] = "http://www.***.com/learn/article/" . $match[1];
    }
}

// Fetch every article page and keep the first "pagenum_0" <div> block.
$contentPattern = '#<div id=pagenum_0[^>]*>[\s\S]*?</div>#';
foreach ($article['link'] as $key => $link) {
    $content_html = getwebcontent($link);
    if (preg_match($contentPattern, $content_html, $matches) === 1) {
        $article['content'][$key] = $matches[0];
    } else {
        // Download failed or the page has no such block: store empty content.
        $article['content'][$key] = '';
    }
}

// Write each article to "<title>.txt".
foreach ($article['title'] as $key => $title) {
    // Titles come from a remote page, so replace characters that are path
    // separators or invalid in file names. This is done on the UTF-8 text,
    // before transcoding, because GBK trail bytes can coincide with ASCII
    // characters such as '\' and '|'.
    $name = preg_replace('#[\\\\/:*?"<>|\x00-\x1F]#', '_', $title);
    $name = trim((string) $name, " .");
    if ($name === '') {
        $name = 'article_' . $key;
    }

    // The file name is converted from UTF-8 to GBK (the original author found
    // that files could not be saved without this). If the title cannot be
    // represented in GBK, fall back to the untranscoded name.
    $encoded = iconv('utf-8', 'gbk', $name);
    if ($encoded === false) {
        $encoded = $name;
    }
    $article['title'][$key] = $encoded;

    file_put_contents("{$encoded}.txt", $article['content'][$key]);
}
?>