PHP采集有很多种方式,但大多需要指定DOM中的某个节点来采集内容,这样太麻烦了。有没有一种方式,可以对任意网址自动识别正文内容呢?答案是有。
用到的组件:
- QueryList:先用QueryList采集页面的HTML内容
- readability:把采集到的原始HTML交给readability分析DOM树结构,提取页面正文内容。
组件仓库地址:
https://github.com/andreskrey/readability.php
看效果:
代码:
php
/**
 * Fetch a page by URL and extract its main article content.
 *
 * On a POST request this reads the `url` request parameter, downloads the
 * page with QueryList, scrapes the keywords/description meta tags, and feeds
 * the raw HTML to Readability to obtain the title, article body, lead image
 * and image list. On any other request method it renders the view.
 *
 * Nothing is caught here: any exception raised while downloading or parsing
 * propagates to the caller.
 *
 * NOTE(review): the URL comes straight from the request and is fetched
 * server-side, so this endpoint can be used to reach internal hosts (SSRF).
 * Restrict schemes/hosts before exposing it to untrusted users.
 *
 * @return mixed On POST, an array with the keys title, seo_title, keywords,
 *               description, source, status, content, thumb and (only when
 *               images were found) photos; otherwise the result of fetch().
 */
public function readability()
{
    if (!$this->request->isPost()) {
        return $this->fetch();
    }

    $param = $this->request->param();
    // Tolerate a missing `url` parameter instead of raising an undefined-index notice.
    $url = trim((string) ($param['url'] ?? ''));

    $ql = QueryList::get($url);
    $html = $ql->getHtml();

    $rules = [
        'keywords' => ['meta[name=keywords]', 'content'],
        'description' => ['meta[name=description]', 'content'],
    ];
    // NOTE(review): assumes queryData() returns a flat map keyed by rule name
    // when no range is set - confirm against the installed QueryList version.
    $query = $ql->rules($rules)->queryData();

    $readability = new Readability(new Configuration());
    $readability->parse($html);

    $title = $readability->getTitle();

    $data = [];
    $data['title'] = $title;
    $data['seo_title'] = $title;
    $data['keywords'] = $query['keywords'] ?? '';
    // The description meta tag is scraped above; use it instead of discarding it.
    $data['description'] = $query['description'] ?? '';
    $data['source'] = $url;
    $data['status'] = true;

    // getContent() may yield null when nothing was extracted; normalise to a string.
    $content = (string) $readability->getContent();

    // Strip HTML comments.
    $content = (string) preg_replace("/<!--[^\!\[]*?(?<!\/\/)-->/", "", $content);

    // Remove only the outermost wrapper <div id|class=...>...</div>. Both the
    // opening and the closing tag must be found, otherwise the content is left
    // untouched so inner markup never ends up with unbalanced tags.
    $openCount = 0;
    $unwrapped = preg_replace('/^\s*<div (?:id|class)=[\'"][^>]*[\'"]>/', '', $content, 1, $openCount);
    if ($unwrapped !== null && $openCount === 1) {
        $closeCount = 0;
        $unwrapped = preg_replace('/\s*<\/div>\s*$/', '', $unwrapped, 1, $closeCount);
        if ($unwrapped !== null && $closeCount === 1) {
            $content = $unwrapped;
        }
    }
    $data['content'] = $content;

    $data['thumb'] = $readability->getImage();

    $images = $readability->getImages();
    if (!empty($images)) {
        $photos = [];
        // Iterate the extracted image URLs (the original looped over the empty $photos array).
        foreach ($images as $src) {
            $photos[] = [
                'src' => $src,
                'alt' => '',
            ];
        }
        $data['photos'] = $photos;
    }

    return $data;
}