<?php
//Include the PHP Simple HTML DOM class library file
include_once('./simplehtmldom/simple_html_dom.php');
//Collect HTML
/**
 * Download a URL with cURL and return the response body.
 *
 * Redirects are followed. The body is returned with leading and trailing
 * whitespace removed.
 *
 * @param string $url Address to fetch.
 * @return string Trimmed response body, or an empty string when the handle
 *                cannot be created or the transfer fails.
 */
function getwebcontent($url){
    $ch = curl_init();
    if ($ch === false) {
        return '';
    }
    $timeout = 10;
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    //CURLOPT_CONNECTTIMEOUT only bounds the connect phase; also cap the whole
    //transfer so a server that stalls mid-response cannot hang the script
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout * 3);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    $contents = curl_exec($ch);
    curl_close($ch);
    //curl_exec() returns false on failure; report that as an empty string
    //explicitly instead of passing a boolean to trim()
    return is_string($contents) ? trim($contents) : '';
}
//Fetch the listing page and pull every article title and URL out of it
$string =
getwebcontent('http://www.babytree.com/learn/zhunbeihuaiyun/jijibeiyun/2');
//Start from empty lists so the loops below are safe when nothing matches
$article = array('title' => array(), 'link' => array(), 'content' => array());
$out = array();
//Match <li><a href="/learn/article/PATH">TITLE</a>: group 1 is the path,
//group 2 the title. The pattern is single-quoted with "#" delimiters so the
//embedded quotes and slashes need no escaping, and the groups are lazy so
//several links on one line stay separate matches.
preg_match_all('#<li><a href="/learn/article/(.*?)">(.*?)</a>#',
$string, $out, PREG_SET_ORDER);
foreach ($out as $match) {
    $article['title'][] = $match[2];
    $article['link'][] = "http://www.babytree.com/learn/article/" . $match[1];
}
//Download each article page and keep the inner HTML of div#pagenum_0
foreach ($article['link'] as $value) {
    $html = file_get_html($value);
    //The page may fail to load or lack the div; store '' in that case so the
    //content list stays index-aligned with the title list
    $div = $html ? $html->find('div[id=pagenum_0]') : array();
    $article['content'][] = isset($div[0]) ? $div[0]->innertext : '';
}
//Transcode the titles from UTF-8 to GBK. Skip this when the whole pipeline is
//UTF-8; the original author needed it because the files could not be saved
//under untranscoded names.
$fileNames = array();
foreach ($article['title'] as $key => $value) {
    //Scraped text must not reach the file system raw: replace path separators
    //and other characters that are illegal in file names, and strip leading and
    //trailing dots/spaces. This is done on the UTF-8 text because there these
    //ASCII bytes never occur inside a multi-byte character, whereas in GBK
    //some of them (e.g. "\" and "|") can be the second byte of a character.
    $safe = trim(preg_replace('#[\\\\/:*?"<>|\x00-\x1F]+#', '_', $value), ' .');
    if ($safe === '') {
        $safe = 'article_' . $key;
    }
    //iconv() returns false when a character cannot be converted; fall back to
    //the UTF-8 text rather than storing false
    $gbkName = iconv('utf-8', 'gbk', $safe);
    $fileNames[$key] = ($gbkName === false) ? $safe : $gbkName;
    $gbkTitle = iconv('utf-8', 'gbk', $value);
    $article['title'][$key] = ($gbkTitle === false) ? $value : $gbkTitle;
}
//Write each article to "<title>.txt" in the current working directory
$num = count($article['title']);
for ($i = 0; $i < $num; $i++) {
    file_put_contents($fileNames[$i] . '.txt', $article['content'][$i]);
}
/* I meant to post this before 12, but by the time I looked it was already half past 3... let's count it as yesterday's.
Regular expressions are the best and fastest way to get the content of an article.
Regular expressions work well, but they are really hard to write! So I did a little research:
plenty of people online use PHP Simple HTML DOM instead. It is somewhat slower, but the results are good.
From including the class library file to writing the txt files takes roughly 7-8 seconds. There is room for further optimization, especially the regular expression that extracts the article content, which is nasty to write.
Feel free to study it a little. */
?>