DomCrawler
https://symfony.com/doc/current/components/dom_crawler.html
安装
composer require symfony/dom-crawler
使用
require __DIR__ . '/vendor/autoload.php';
use Symfony\Component\DomCrawler\Crawler;
/**
 * Download a page and pull structured fields out of its HTML.
 *
 * The markup returned by get_html() is loaded into a DomCrawler instance.
 * The 'Title*' entry is read with an XPath query (the h1 directly under the
 * element whose id is "product-info_productDetail"); the 'Subtitle' entry is
 * read with the CSS selector '#xxx' (presumably a placeholder to be replaced
 * with a real selector).
 *
 * If any \Exception is raised while extracting, the exception is dumped with
 * var_dump() and the script terminates.
 *
 * @param mixed $url Passed unchanged to get_html().
 *
 * @return array Extracted text keyed by 'Title*' and 'Subtitle'.
 */
function Spider($url)
{
    $html = get_html($url);

    $dom = new Crawler();
    $dom->addHtmlContent($html);

    // Structured results are collected in this array.
    $fields = [];

    try {
        // XPath-based extraction.
        $titleNodes = $dom->filterXPath('//*[@id="product-info_productDetail"]/h1');
        $fields['Title*'] = $titleNodes->text();

        // CSS-selector-based extraction.
        $subtitleNodes = $dom->filter('#xxx');
        $fields['Subtitle'] = $subtitleNodes->text();
    } catch (\Exception $error) {
        // Debug handling: print the exception and stop the script.
        var_dump($error);
        exit;
    }

    return $fields;
}
如何压缩HTML
/**
 * Minify an HTML string by normalizing whitespace and stripping comments.
 *
 * Steps, in order:
 *  1. Line breaks and tabs are turned into spaces.
 *  2. Spaces around a space-free run of text between ">" and "<" are trimmed.
 *  3. Every run of whitespace is collapsed to a single space.
 *  4. HTML comments that contain no "!" are removed.
 *  5. A space directly after or directly before a double quote is removed.
 *  6. Slash-star block comments that contain no inner "*" are removed.
 *
 * Caveats: the rules are purely textual, so whitespace-sensitive content
 * (for example inside pre or textarea elements) is altered too, and rule 5
 * also applies to double quotes that appear in visible text.
 *
 * @param string $string HTML markup to compress.
 *
 * @return string The compressed markup. If the regex engine reports a failure,
 *                the whitespace-normalized input is returned instead of null.
 */
function compress_html($string){
    // Line breaks and tabs are whitespace, not noise: deleting them outright
    // would glue together tokens that were separated only by them
    // ("<a\nhref=..." would become "<ahref=...", "foo\nbar" would become
    // "foobar"). Replace them with a space and let the patterns below trim or
    // collapse the result.
    $string = str_replace(["\r\n", "\n", "\t"], ' ', $string);

    $pattern = [
        // Trim spaces around a space-free run sitting between ">" and "<".
        "/> *([^ ]*) *</",
        // Collapse any whitespace run into a single space.
        "/[\s]+/",
        // Remove HTML comments that do not contain "!".
        "/<!--[^!]*-->/",
        // Remove a space that directly follows a double quote.
        "/\" /",
        // Remove a space that directly precedes a double quote.
        "/ \"/",
        // Remove /* ... */ comments without an inner "*"; this regex uses the
        // single quote as its delimiter.
        "'/\*[^*]*\*/'",
    ];
    $replace = [
        ">\\1<",
        " ",
        "",
        "\"",
        "\"",
        "",
    ];

    $compressed = preg_replace($pattern, $replace, $string);

    // preg_replace() yields null when PCRE fails (e.g. backtrack limit hit);
    // hand back the whitespace-normalized input rather than losing the page.
    return $compressed === null ? $string : $compressed;
}