PHP 根据 XPath 解析 DOM

By | 2022-03-07

DomCrawler

https://symfony.com/doc/current/components/dom_crawler.html

安装

composer require symfony/dom-crawler

下文示例中的 filter() 使用 CSS 选择器，需要额外安装 CssSelector 组件：

composer require symfony/css-selector

使用

require __DIR__ . '/vendor/autoload.php';
use Symfony\Component\DomCrawler\Crawler;
/**
 * Download a page and pull a few structured fields out of its markup.
 *
 * The markup comes from get_html(), which is defined outside this snippet
 * (presumably it returns the page HTML as a string -- confirm against its
 * definition). The title is located with an XPath expression and the
 * subtitle with a CSS selector.
 *
 * If any extraction step throws, the exception is dumped and the script
 * terminates; nothing is returned in that case.
 *
 * @param mixed $url Passed straight through to get_html().
 *
 * @return array Extracted values keyed by 'Title*' and 'Subtitle'.
 */
function Spider($url)
{
    $html = get_html($url);

    // Load the markup into a crawler so it can be queried.
    $dom = new Crawler();
    $dom->addHtmlContent($html);

    // Collected fields end up in this array.
    $fields = [];

    try {
        // XPath lookup: the <h1> directly under the product-detail container.
        $titleNode = $dom->filterXPath('//*[@id="product-info_productDetail"]/h1');
        $fields['Title*'] = $titleNode->text();

        // CSS-selector lookup for the subtitle element.
        $subtitleNode = $dom->filter('#xxx');
        $fields['Subtitle'] = $subtitleNode->text();
    } catch (\Exception $e) {
        // Debug aid: show the failure and stop the script.
        var_dump($e);
        exit;
    }

    return $fields;
}

如何压缩HTML

/**
 * Collapse an HTML string onto a single line and strip redundant characters.
 *
 * Steps, in order:
 *   1. Line breaks and tabs become a single space. They are NOT deleted
 *      outright, because deleting them glues neighbouring tokens together
 *      ("<a\nhref" would become "<ahref", "hello\nworld" "helloworld").
 *   2. Spaces directly after ">" and directly before "<" are removed.
 *   3. Remaining whitespace runs are collapsed to one space.
 *   4. HTML comments and C-style block comments are removed.
 *   5. A space immediately after or before a double quote is removed.
 *
 * Whitespace is collapsed everywhere in the input; the function makes no
 * exception for content such as <pre> blocks or inline scripts.
 *
 * @param string|array $string Markup to compress.
 *
 * @return string|array The compressed markup. If the regex engine reports
 *                      an error, the whitespace-normalised input is
 *                      returned instead of null.
 */
function compress_html($string){
    // Normalise line breaks and tabs to spaces so adjacent words/attributes stay separated.
    $string = str_replace(["\r\n", "\n", "\r", "\t"], ' ', $string);

    // Pattern => replacement, applied sequentially in this order.
    $rules = [
        // Trim spaces between a closing ">" and the next "<".
        '/> *([^ ]*) *</' => '>$1<',
        // Collapse any run of whitespace to one space.
        '/\s+/'           => ' ',
        // Drop HTML comments (only those without a "!" in their body).
        '/<!--[^!]*-->/'  => '',
        // NOTE(review): the next two rules also remove the space between
        // adjacent attributes (href="x" class="y" -> href="x"class="y") and
        // around quoted text; kept for backward compatibility.
        '/" /'            => '"',
        '/ "/'            => '"',
        // Drop /* ... */ block comments (only those without a "*" in their body).
        '#/\*[^*]*\*/#'   => '',
    ];

    $compressed = preg_replace(array_keys($rules), array_values($rules), $string);

    // preg_replace() yields null on a PCRE failure; fall back to the normalised input.
    return $compressed ?? $string;
}