<?php    
    namespace App\Console\Commands;
    use App\Library\Common;
    use App\Library\KgQuote;
    use App\Models\Prod\ProdContent;
    use GuzzleHttp\Client;
    use Illuminate\Console\Command;
    use Illuminate\Support\Facades\Log;
    use PHPUnit\Exception;
    class Test_Spider extends Command
    {
        /**
         * The name and signature of the console command.
         *
         * @var string
         */
        protected $signature = 'TestSpider';
        /**
         * The console command description.
         *
         * @var string
         */
        protected $description = '测试使用xpath抓取页面';
        /**
         * Create a new command instance.
         *
         * @return void
         */
        public function __construct()
        {
            parent::__construct();
        }
        /**
         * Execute the console command.
         *
         * @return mixed
         */
        public function handle()
        {      
            Log::info('spider1', ['start']);
            for($page = 1; $page<87; $page++){
                $url = 'http://www.biochemsafebuy.com/m/category_detailed/2/'.$page;
                $this->spiderList($url);
                sleep(1);
            }
            Log::info('spider1', ['end']);
        }
        
        
        //爬取列表页面,解析详情地址
        private function spiderList($url){
            Log::info('list_url', [$url]);
            try{           
                $client = new Client();
                $response = $client->get($url);
                $html = $response->getBody();
            }catch (Exception $e){
                Log::info('spiderListException ', [$e]);
                return false;
            }
            
            $dom = new \DOMDocument();
            @$dom->loadHTML($html);
            $dom->normalize();
            $xpath = new \DOMXPath($dom);
         
            $lists = $xpath->query('.//div[contains(@class, "wrap-img")]/a/@href');
            foreach ($lists as $node) {
                $detail_url = $node->nodeValue;
                $format_res = $this->formatUrl($detail_url);
                if(!$format_res){
                    continue;
                }
                $cas = $format_res['cas'];
                $new_detail_url = $format_res['url'];
                
                //判断是否抓过
                $pcontent = ProdContent::where('cas', '=', $cas)->first();
                if($pcontent) continue;
                
                $content = $this->spiderDetail($new_detail_url);
                if(!$content) continue;
                
                ProdContent::create(array(
                     'cas'=>$cas,
                     'url'=>$new_detail_url,
                     'content'=>$content
                ));
                
                Log::info('res', [$format_res]);
                sleep(1);
            }
        }
        
        private function formatUrl($url){
            //判断url格式是否正确
            $items = explode("-", trim($url, '/'));
            if(count($items) != 4){
                return false;
            }
            $detail_url = 'http://www.biochemsafebuy.com'. str_replace('p', 'd', $url);
            $data = array(
                'cas'=>$items[0]."-".$items[1]."-".$items[2],
                'url'=>$detail_url
            );
            return $data;
        }
        
        /**
         * 抓取详情页
         * @param $url
         */
        private function spiderDetail($url){
            //$html = file_get_contents('http://www.biochemsafebuy.com/538-75-0-d497807/');
            try{
                //$html = file_get_contents($url);
                $client = new Client();
                $response = $client->get($url);
                $html = $response->getBody();
            }catch (Exception $e){
                Log::info('spiderDetailException ', [$e]);
                return false;
            }
            $dom = new \DOMDocument();
            @$dom->loadHTML($html);
            $dom->normalize();
            $xpath = new \DOMXPath($dom);
            //$hrefs = $xpath->query("/html/body//a//@href");
            //$details = $xpath->query('.//div[@class="detail_box detail_nature"]/text()');
            $details = $xpath->query('.//div[contains(@class, "detail_box")]/text()');
            $info = '';
            foreach ($details as $node) {
                if(strpos($node->nodeValue, '物理化学性质资料暂无') !== false){
                    continue;
                }
                if(strpos($node->nodeValue, '安全信息资料暂无') !== false){
                    continue;
                }
                if(strpos($node->nodeValue, 'MSDS资料暂无') !== false){
                    continue;
                }
                if(strpos($node->nodeValue, '上下游产品资料暂无') !== false){
                    continue;
                }
                if(strpos($node->nodeValue, '生产用途资料暂无') !== false){
                    continue;
                }
                if(strpos($node->nodeValue, '海关数据资料暂无') !== false){
                    continue;
                }
                if(strpos($node->nodeValue, '合成路线资料暂无') !== false){
                    continue;
                }
                $info .= $node->nodeValue;
            }
            return $info;
        }
    }