HtmlCustomCollect.php 13.0 KB
<?php

namespace App\Console\Commands\Update;

use App\Models\Collect\CollectSource;
use App\Models\Collect\CollectTask;
use App\Models\Com\UpdateLog;
use App\Models\RouteMap\RouteMap;
use App\Services\CosService;
use App\Services\ProjectServer;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Redis;

/**
 * Collects custom HTML pages of a main site while upgrading a project from 4.0/5.0 to 6.0.
 *
 * For each configured page the command fetches the HTML, mirrors the assets it references
 * (images, scripts, stylesheets, videos, downloadable files) to COS, rewrites the links and
 * domains, and stores the result as a static index.html under the project's public directory.
 *
 * Class HtmlCustomCollect
 * @package App\Console\Commands\Update
 * @author Akun
 * @date 2023/12/13 14:44
 */
class HtmlCustomCollect extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'project_html_custom_collect';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '执行项目自定义html页面采集';


    /**
     * Entry point of the command: collects every page in the hard-coded list.
     *
     * The project id, target site and page list are fixed values edited per run.
     *
     * @return void
     */
    public function handle()
    {
        ini_set('memory_limit', '512M');

        $project_id = 626;
        $project_site = 'v6-m342g.globalso.site';
        $pages = [
            'https://www.lecusostreetlight.com/project_catalog/project/'
        ];

        foreach ($pages as $page) {
            $this->start_collect($page, $project_id, $project_site);
        }
    }

    /**
     * Collects one page for a project and writes it as a static index.html.
     *
     * The project database connection is always released, including on the
     * "no html" and exception paths, and the method pauses two seconds before returning.
     *
     * @param string $page         Absolute URL of the page to collect.
     * @param int    $project_id   Project whose database is selected via ProjectServer::useProject().
     * @param string $project_site Target site host; replaces the original domains and names the output directory.
     * @return bool Always true; failures are reported on stdout.
     */
    protected function start_collect($page, $project_id, $project_site)
    {
        $page_arr = parse_url($page);
        $domain = $page_arr['host'] ?? '';
        $path = $page_arr['path'] ?? '/';
        // The path is used as a directory that receives index.html, so it must end with a slash;
        // otherwise the file would be written as "<dir>index.html" next to the created directory.
        if (substr($path, -1) !== '/') {
            $path .= '/';
        }

        if ($domain === '') {
            $this->log_line($project_id, $page, 'error: invalid page url');
            sleep(2);
            return true;
        }

        // Select the project database.
        $project = ProjectServer::useProject($project_id);
        try {
            if ($project) {
                $this->log_line($project_id, $page, 'collect start');
                if ($this->collect_page($page, $path, $domain, $project_id, $project_site)) {
                    $this->log_line($project_id, $page, 'collect end');
                }
            } else {
                $this->log_line($project_id, $page, 'no project');
            }
        } finally {
            // Release the project database connection on every path.
            DB::disconnect('custom_mysql');
        }

        sleep(2);
        return true;
    }

    /**
     * Fetches a page, mirrors its assets, rewrites domains and writes the HTML to disk.
     *
     * @param string $page         Absolute URL of the page.
     * @param string $path         URL path of the page, ending with "/".
     * @param string $domain       Host of the page URL.
     * @param int    $project_id   Project id.
     * @param string $project_site Target site host.
     * @return bool True when the file was written, false when the page was skipped or failed (already logged).
     */
    private function collect_page($page, $path, $domain, $project_id, $project_site)
    {
        // Original domain information of the site (keys web_url_domain and home_url are read below).
        $old_info = getOldDomain($project_id, $domain);

        try {
            $html = curl_c($page, false);
            // curl_c() is compared against '0' in the original code, presumably its failure value;
            // an empty body is treated the same way.
            if (!$html || $html == '0') {
                $this->log_line($project_id, $page, 'error: no html');
                return false;
            }

            // Strip inline base64 payloads from the copy used for link matching only.
            // The payload is matched by its own alphabet so that nothing after it is removed.
            $new_html = preg_replace('/(data:[^,\'"\s]*;base64,)[A-Za-z0-9+\/=]+/', '$1', $html);
            if ($new_html === null) {
                // PCRE failure: fall back to matching against the untouched HTML.
                $new_html = $html;
            }

            // Find the asset links.
            $source_list = $this->html_preg($new_html, $project_id, $domain, $old_info['web_url_domain'], $old_info['home_url']);

            // Download the assets and rewrite their links.
            if ($source_list) {
                $html = $this->upload_source($html, $source_list, $project_id, $domain, $old_info['web_url_domain'], $old_info['home_url']);
            }

            // Replace the original domains with the target site.
            $html = str_replace($old_info['web_url_domain'], $project_site, $html);
            $html = str_replace($old_info['home_url'], $project_site, $html);

            // Temporarily hide the minor-language switchers.
            $html = str_replace('<div class="change-language ensemble">', '<div class="change-language ensemble" style="display: none">', $html);
            $html = str_replace('<div class="language_more">', '<div class="language_more" style="display: none">', $html);
            // No backslash before the closing quote: inside a single-quoted PHP string it would be emitted literally.
            $html = str_replace('</body>', '<script src="https://ecdn6.globalso.com/public/customerVisit.min.js"></script></body>', $html);

            // Write the HTML to a file.
            $file_path = '/www/wwwroot/globalso-v6-c-glo/public/' . $project_site . $path;
            $converted = iconv("UTF-8", "GBK", $file_path);
            if ($converted !== false) {
                // Keep the unconverted path when iconv cannot convert it.
                $file_path = $converted;
            }
            if (!is_dir($file_path) && !mkdir($file_path, 0777, true) && !is_dir($file_path)) {
                throw new \RuntimeException('cannot create directory ' . $file_path);
            }

            if (file_put_contents($file_path . 'index.html', $html) === false) {
                throw new \RuntimeException('cannot write ' . $file_path . 'index.html');
            }
            chmod($file_path . 'index.html', 0777);
        } catch (\Exception $e) {
            $this->log_line($project_id, $page, 'error: ' . $e->getMessage());
            return false;
        }

        return true;
    }

    /**
     * Prints one timestamped progress line for a page.
     *
     * @param int    $project_id Project id.
     * @param string $page       Page URL.
     * @param string $message    Trailing message, e.g. "collect start".
     * @return void
     */
    private function log_line($project_id, $page, $message)
    {
        echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', page: ' . $page . ', ' . $message . PHP_EOL;
    }

    /**
     * Extracts the asset links of an HTML document with regular expressions.
     *
     * Scans, in order: img src, script src, source src, link href, CSS url(...) and a href.
     * Every candidate goes through url_check(); each distinct URL appears once in the result,
     * so a repeated asset is looked up and later downloaded only once.
     *
     * @param string $html            HTML to scan.
     * @param int    $project_id      Project id.
     * @param string $domain          Host of the collected page.
     * @param string $web_url_domain  Original site domain.
     * @param string $home_url        Original home URL.
     * @return array List of url_check() results (keys: download, url, url_complete).
     */
    protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
    {
        if (!$html) {
            return [];
        }

        // [tag, attribute] pairs sharing the quoted-attribute pattern (URL in capture group 2).
        $patterns = [];
        foreach ([['img', 'src'], ['script', 'src'], ['source', 'src'], ['link', 'href']] as $pair) {
            $patterns[] = ['/<' . $pair[0] . '\s+[^>]*?' . $pair[1] . '\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2];
        }
        // CSS background images (URL in capture group 1).
        $patterns[] = ["/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", 1];
        // Downloadable files linked from anchors.
        $patterns[] = ['/<a\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2];

        $seen = [];
        $source = [];
        foreach ($patterns as $entry) {
            list($pattern, $group) = $entry;
            if (!preg_match_all($pattern, $html, $matches)) {
                continue;
            }

            foreach ($matches[$group] as $candidate) {
                // Skip raw duplicates before url_check() so no second database lookup is made.
                if (isset($seen[$candidate])) {
                    continue;
                }
                $seen[$candidate] = true;

                $checked = $this->url_check($candidate, $project_id, $domain, $web_url_domain, $home_url);
                // url_check() strips &quot;, so different raw candidates can normalise to the same URL.
                if ($checked && !isset($source[$checked['url']])) {
                    $source[$checked['url']] = $checked;
                }
            }
        }

        return array_values($source);
    }

    /**
     * Decides whether a URL is a site asset that has to be mirrored.
     *
     * A URL qualifies when it has no scheme or http(s), no host or a host contained in one of the
     * original domains, and an absolute path containing a dot whose last extension is not
     * html, php, com or xml (compared case-insensitively).
     *
     * @param string $url             Raw URL taken from the document.
     * @param int    $project_id      Project id used for the CollectSource lookup.
     * @param string $domain          Host used to complete host-less URLs.
     * @param string $web_url_domain  Original site domain.
     * @param string $home_url        Original home URL.
     * @return array|false False when the URL is not an asset; otherwise an array with
     *                     "download" (true when no CollectSource row exists yet),
     *                     "url" (the URL with &quot; removed) and
     *                     "url_complete" (absolute URL to fetch, or the stored target when already collected).
     */
    protected function url_check($url, $project_id, $domain, $web_url_domain, $home_url)
    {
        if (!$url) {
            return false;
        }

        $url = str_replace('&quot;', '', $url);
        $arr = parse_url($url);
        if ($arr === false) {
            return false;
        }

        $scheme = $arr['scheme'] ?? '';
        $host = $arr['host'] ?? '';
        $path = $arr['path'] ?? '';
        $query = $arr['query'] ?? '';

        if (!(empty($scheme) || $scheme == 'https' || $scheme == 'http')) {
            return false;
        }

        $own_host = empty($host)
            || (strpos((string)$web_url_domain, $host) !== false)
            || (strpos((string)$home_url, $host) !== false);
        if (!$own_host) {
            return false;
        }

        if (!$path || substr($path, 0, 1) != '/' || strpos($path, '.') === false) {
            return false;
        }

        // Text after the last dot; lower-cased so ".HTML" or ".PHP" pages are not treated as assets.
        $extension = strtolower((string)substr((string)strrchr($path, '.'), 1));
        if (in_array($extension, ['html', 'php', 'com', 'xml'], true)) {
            return false;
        }

        $source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
        if ($source) {
            // Already collected: reuse the stored target.
            return [
                'download' => false,
                'url' => $url,
                'url_complete' => $source['target']
            ];
        }

        return [
            'download' => true,
            'url' => $url,
            'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path . ($query ? '?' . $query : '')
        ];
    }

    /**
     * Downloads the listed assets and replaces their links in the HTML.
     *
     * Newly uploaded css/js files are additionally scanned for nested assets,
     * see rewrite_nested_sources().
     *
     * @param string $html            HTML whose links are rewritten.
     * @param array  $source          Items produced by html_preg() / url_check().
     * @param int    $project_id      Project id.
     * @param string $domain          Host of the collected page.
     * @param string $web_url_domain  Original site domain.
     * @param string $home_url        Original home URL.
     * @return string HTML with the asset links replaced.
     */
    protected function upload_source($html, $source, $project_id, $domain, $web_url_domain, $home_url)
    {
        foreach ($source as $vs) {
            if (!$vs['download']) {
                // Collected earlier: url_complete holds the stored target.
                $html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
                continue;
            }

            $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
            if (!$new_source) {
                continue;
            }

            $this->record_source($project_id, $vs['url'], $new_source);
            $html = str_replace($vs['url'], getImageUrl($new_source), $html);

            $is_css = substr($new_source, -3, 3) == 'css';
            $is_js = substr($new_source, -2, 2) == 'js';
            if ($is_css || $is_js) {
                $this->rewrite_nested_sources($new_source, $is_css, $vs['url'], $project_id, $domain, $web_url_domain, $home_url);
            }
        }

        return $html;
    }

    /**
     * Mirrors the assets referenced inside an uploaded css/js file and re-uploads the rewritten file.
     *
     * CSS files are scanned for url(...); JS files for largeURL:/thumbURL: string values.
     *
     * @param string $new_source      Stored path of the uploaded css/js file.
     * @param bool   $is_css          True for a stylesheet, false for a script.
     * @param string $origin_url      Original URL of the css/js file; base for relative references.
     * @param int    $project_id      Project id.
     * @param string $domain          Host of the collected page.
     * @param string $web_url_domain  Original site domain.
     * @param string $home_url        Original home URL.
     * @return void
     */
    private function rewrite_nested_sources($new_source, $is_css, $origin_url, $project_id, $domain, $web_url_domain, $home_url)
    {
        $source_html = curl_c(getImageUrl($new_source), false);
        if (!$source_html) {
            return;
        }

        // "(?:large|thumb)" is an alternation; a bracketed "[large|thumb]+" would be a character class
        // matching any run of those letters.
        $pattern = $is_css
            ? "/url\(['\"]?(\s*[^>]+?)['\"]?\)/i"
            : "/(?:large|thumb)URL:['\"]+(\s*[^>]+?)['\"]+,/i";
        if (!preg_match_all($pattern, $source_html, $result_source) || empty($result_source[1])) {
            return;
        }

        // Loop-invariant: CDN value of the COS disk, compared with the host of each reference.
        $cos = config('filesystems.disks.cos');
        $cosCdn = $cos['cdn'] ?? '';

        // str_replace() below rewrites every occurrence at once, so each distinct reference is handled once.
        foreach (array_unique($result_source[1]) as $vjs) {
            $vjs_down = str_replace('&quot;', '', $vjs);
            if (strpos($vjs_down, 'data:') !== false) {
                // Skip inline binary data.
                continue;
            }
            if (strlen($vjs_down) > 255) {
                // Skip overly long references.
                continue;
            }

            $vjs_down_arr = parse_url($vjs_down);
            $vjs_down_host = $vjs_down_arr['host'] ?? '';

            if ($vjs_down_host && $vjs_down_host == $cosCdn) {
                // Already points at the CDN.
                continue;
            }

            if (empty($vjs_down_host) && substr($vjs_down, 0, 1) != '/') {
                // Relative reference: resolve against the directory of the css/js file.
                $url_arr = explode('/', $origin_url);
                $url_arr[count($url_arr) - 1] = $vjs_down;
                $vjs_down = implode('/', $url_arr);
            }

            $vjs_result = $this->url_check($vjs_down, $project_id, $domain, $web_url_domain, $home_url);
            if (!$vjs_result) {
                continue;
            }

            if ($vjs_result['download']) {
                $new_vjs = CosService::uploadRemote($project_id, 'source', $vjs_result['url_complete']);
                if ($new_vjs) {
                    $this->record_source($project_id, $vjs_result['url'], $new_vjs);
                    $source_html = str_replace($vjs, getImageUrl($new_vjs), $source_html);
                }
            } else {
                $source_html = str_replace($vjs, getImageUrl($vjs_result['url_complete']), $source_html);
            }
        }

        // Store the rewritten css/js content under the same path.
        CosService::uploadRemote($project_id, 'source', $new_source, $new_source, $source_html);
    }

    /**
     * Records the mapping from an original asset URL to its stored target.
     *
     * @param int    $project_id Project id.
     * @param string $origin     Original asset URL.
     * @param string $target     Stored path returned by CosService::uploadRemote().
     * @return void
     */
    private function record_source($project_id, $origin, $target)
    {
        $now = date('Y-m-d H:i:s');
        CollectSource::insert([
            'project_id' => $project_id,
            'origin' => $origin,
            'target' => $target,
            'created_at' => $now,
            'updated_at' => $now,
        ]);
    }
}