<?php
namespace App\Console\Commands;
use App\Library\Common;
use App\Library\KgQuote;
use App\Models\Prod\ProdContent;
use GuzzleHttp\Client;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\Log;
use PHPUnit\Exception;
/**
 * Console command that crawls the category list pages of biochemsafebuy.com,
 * follows every product link it has not stored yet, and saves the plain text
 * of the product detail page as a ProdContent row keyed by CAS number.
 */
class Test_Spider extends Command
{
    /** Scheme and host that every crawled URL is built from. */
    const BASE_URL = 'http://www.biochemsafebuy.com';

    /** Path prefix of the paginated category list; the page number is appended. */
    const LIST_PATH = '/m/category_detailed/2/';

    /** Number of the last list page to crawl (pages 1..LAST_PAGE are visited). */
    const LAST_PAGE = 86;

    /**
     * Placeholder texts the site prints for sections that have no data.
     * A text node containing any of them is left out of the stored content.
     */
    const EMPTY_SECTION_MARKERS = [
        '物理化学性质资料暂无',
        '安全信息资料暂无',
        'MSDS资料暂无',
        '上下游产品资料暂无',
        '生产用途资料暂无',
        '海关数据资料暂无',
        '合成路线资料暂无',
    ];

    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'TestSpider';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '测试使用xpath抓取页面';

    /**
     * HTTP client shared by all requests of one run; created on first use.
     *
     * @var Client|null
     */
    private $httpClient = null;

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Execute the console command: visit every list page in order, pausing
     * one second between pages.
     *
     * @return mixed
     */
    public function handle()
    {
        Log::info('spider1', ['start']);
        for ($page = 1; $page <= self::LAST_PAGE; $page++) {
            $url = self::BASE_URL . self::LIST_PATH . $page;
            $this->spiderList($url);
            sleep(1);
        }
        Log::info('spider1', ['end']);
    }

    /**
     * Crawl one list page, extract the product links and store the detail
     * text of every product whose CAS number is not in ProdContent yet.
     *
     * @param string $url Absolute URL of the list page.
     * @return false|null false when the page could not be fetched or parsed,
     *                    null otherwise.
     */
    private function spiderList($url)
    {
        Log::info('list_url', [$url]);

        $xpath = $this->fetchXPath($url, 'spiderListException ');
        if ($xpath === null) {
            return false;
        }

        $lists = $xpath->query('.//div[contains(@class, "wrap-img")]/a/@href');
        foreach ($lists as $node) {
            $format_res = $this->formatUrl($node->nodeValue);
            if (!$format_res) {
                continue;
            }
            $cas = $format_res['cas'];
            $new_detail_url = $format_res['url'];

            // Skip products that were already scraped on an earlier run.
            $pcontent = ProdContent::where('cas', '=', $cas)->first();
            if ($pcontent) {
                continue;
            }

            $content = $this->spiderDetail($new_detail_url);
            if (!$content) {
                // Fetch failed or the page had no usable text: nothing to store.
                continue;
            }

            ProdContent::create([
                'cas' => $cas,
                'url' => $new_detail_url,
                'content' => $content,
            ]);
            Log::info('res', [$format_res]);
            sleep(1);
        }
    }

    /**
     * Validate a list-page href and derive the CAS number and detail URL.
     *
     * The href must consist of exactly four hyphen-separated segments once
     * surrounding slashes are stripped; the first three form the CAS number.
     * Only a leading "p" of the fourth segment is turned into "d", every
     * other character of the href is kept as-is.
     *
     * @param string $url Href taken from the list page.
     * @return array|false ['cas' => string, 'url' => string], or false when
     *                     the href does not have four segments.
     */
    private function formatUrl($url)
    {
        $items = explode('-', trim($url, '/'));
        if (count($items) !== 4) {
            return false;
        }

        // Exactly three hyphens exist, so the last one precedes the fourth segment.
        $markerPos = strrpos($url, '-') + 1;
        $detailPath = $url;
        if (substr($url, $markerPos, 1) === 'p') {
            $detailPath = substr_replace($url, 'd', $markerPos, 1);
        }

        return [
            'cas' => $items[0] . '-' . $items[1] . '-' . $items[2],
            'url' => self::BASE_URL . $detailPath,
        ];
    }

    /**
     * Crawl a product detail page and collect the text of its detail boxes.
     *
     * @param string $url Absolute URL of the detail page.
     * @return string|false Concatenated text of all non-placeholder text
     *                      nodes (may be empty), or false when the page could
     *                      not be fetched or parsed.
     */
    private function spiderDetail($url)
    {
        $xpath = $this->fetchXPath($url, 'spiderDetailException ');
        if ($xpath === null) {
            return false;
        }

        $details = $xpath->query('.//div[contains(@class, "detail_box")]/text()');
        $info = '';
        foreach ($details as $node) {
            if ($this->isEmptySectionText($node->nodeValue)) {
                continue;
            }
            $info .= $node->nodeValue;
        }

        return $info;
    }

    /**
     * Tell whether a text node is one of the "no data available" placeholders.
     *
     * @param string $text Text node value.
     * @return bool true when the text contains any entry of EMPTY_SECTION_MARKERS.
     */
    private function isEmptySectionText($text)
    {
        foreach (self::EMPTY_SECTION_MARKERS as $marker) {
            if (strpos($text, $marker) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Download a page and return an XPath handle on its parsed HTML.
     *
     * @param string $url    Absolute URL to fetch.
     * @param string $logTag Log message used when the fetch fails.
     * @return \DOMXPath|null null when the request threw or the body was empty.
     */
    private function fetchXPath($url, $logTag)
    {
        try {
            if ($this->httpClient === null) {
                $this->httpClient = new Client();
            }
            $html = (string) $this->httpClient->get($url)->getBody();
        } catch (\Exception $e) {
            // Global \Exception on purpose: the imported PHPUnit\Exception is
            // never thrown by Guzzle, so catching it would let every HTTP
            // failure abort the whole crawl.
            Log::info($logTag, [$e]);
            return null;
        }

        if ($html === '') {
            // loadHTML() rejects an empty string (ValueError on PHP 8).
            Log::info($logTag, ['empty response body', $url]);
            return null;
        }

        // Collect markup warnings internally instead of silencing them with "@".
        $previous = libxml_use_internal_errors(true);
        $dom = new \DOMDocument();
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        $dom->normalize();

        return new \DOMXPath($dom);
    }
}