<?php

namespace App\Console\Commands\Update;

use App\Models\Collect\CollectSource;
use App\Models\Collect\CollectTask;
use App\Models\Com\UpdateLog;
use App\Models\RouteMap\RouteMap;
use App\Services\CosService;
use App\Services\ProjectServer;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Redis;

/**
 * Collects the minor-language (non-English) HTML pages of projects that are
 * being upgraded from 4.0/5.0 to 6.0.
 *
 * The command runs forever: it pops "<project_id>_<collect_id>" task ids from
 * a Redis list, fetches the page, re-hosts the static resources it references
 * and stores the rewritten HTML on the CollectTask row.
 *
 * Class HtmlLanguageCollect
 * @package App\Console\Commands\Update
 * @author Akun
 * @date 2023/11/20 14:04
 */
class HtmlLanguageCollect extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'project_html_language_collect';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '执行项目html页面采集';


    /**
     * Entry point: process collect tasks in an endless loop.
     * Pacing (sleep) is handled inside start_collect().
     */
    public function handle()
    {
        while (true) {
            $this->start_collect();
        }
    }

    /**
     * Fetch one task and process it.
     *
     * Sleeps 60s when no project is waiting for collection and 2s in every
     * other case, then returns so that handle() can loop again.
     *
     * @return bool always true
     */
    protected function start_collect()
    {
        $task_id = $this->get_task();
        if ($task_id === false) {
            // Every project has finished collecting.
            sleep(60);
            return true;
        } elseif ($task_id === 0) {
            // The current project has finished collecting.
            sleep(2);
            return true;
        }

        // Task ids are pushed by get_task() as "<project_id>_<collect_id>".
        // Guard against an empty pop result or a malformed entry instead of
        // reading an undefined offset.
        $task_arr = explode('_', (string)$task_id);
        if (count($task_arr) < 2 || $task_arr[0] === '' || $task_arr[1] === '') {
            if ((string)$task_id !== '') {
                echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task_id . ', error: invalid task id' . PHP_EOL;
            }
            sleep(2);
            return true;
        }
        $project_id = $task_arr[0];
        $collect_id = $task_arr[1];

        // Switch to the project's database.
        $project = ProjectServer::useProject($project_id);
        try {
            if ($project) {
                $this->collect_page($project_id, $collect_id);
            }
        } finally {
            // Always close the project database, including on early exits
            // and when an exception propagates.
            DB::disconnect('custom_mysql');
        }

        sleep(2);
        return true;
    }

    /**
     * Collect a single page: download its HTML, re-host its resources and
     * persist the result. The task status moves UN -> ING -> COM, or FAIL
     * when the page cannot be fetched or an exception is thrown.
     *
     * Must be called while the project's database connection is active.
     *
     * @param string $project_id
     * @param string $collect_id
     * @return void
     */
    private function collect_page($project_id, $collect_id)
    {
        $collect_info = CollectTask::select(['id', 'domain', 'route', 'language'])
            ->where('id', $collect_id)
            ->where('status', CollectTask::STATUS_UN)
            ->where('language', '!=', '')
            ->first();
        if (!$collect_info) {
            // No matching row in STATUS_UN with a language set.
            return;
        }

        $this->log_collect($project_id, $collect_id, 'collect start');

        $collect_info->status = CollectTask::STATUS_ING;
        $collect_info->save();

        // Work out the domain of the English site.
        $domain = $this->english_domain($collect_info->domain, $collect_info->language);

        // The site config may override the hosts that count as "own" hosts.
        $web_url_domain = $domain;
        $home_url = $domain;
        $url_web_config = 'https://' . $domain . '/wp-content/cache/user_config.text';
        $data_config = curl_c($url_web_config);
        if ($data_config) {
            $web_url_arr = parse_url($data_config['web_url_domain'] ?? '');
            if (isset($web_url_arr['host'])) {
                $web_url_domain = $web_url_arr['host'];
            }

            $home_url_arr = parse_url($data_config['home_url'] ?? '');
            if (isset($home_url_arr['host'])) {
                $home_url = $home_url_arr['host'];
            }
        }

        // Fetch the HTML page, download its resources and rewrite the links.
        try {
            $html = curl_c('https://' . $collect_info->domain . $collect_info->route, false);
            // NOTE(review): curl_c() is a project helper; '0' is presumably
            // its failure marker — confirm against the helper.
            if ($html == '0') {
                $collect_info->status = CollectTask::STATUS_FAIL;
                $collect_info->save();
                $this->log_collect($project_id, $collect_id, 'error: no html');
                return;
            }

            $source_list = $this->html_preg($html, $project_id, $collect_info->domain, $web_url_domain, $home_url);

            if ($source_list) {
                $html = $this->upload_source($html, $source_list, $project_id);
            }
        } catch (\Exception $e) {
            $collect_info->status = CollectTask::STATUS_FAIL;
            $collect_info->save();

            $this->log_collect($project_id, $collect_id, 'error: ' . $e->getMessage());
            return;
        }

        $collect_info->html = $html;
        $collect_info->status = CollectTask::STATUS_COM;
        $collect_info->save();

        $this->log_collect($project_id, $collect_id, 'collect end');
    }

    /**
     * Derive the English site's domain from a language page's domain.
     *
     * - "host/xx..."  -> "host" (everything before the first slash)
     * - "xx.host.tld" -> "www.host.tld" (only the leading language label is
     *   swapped, so a language code that also appears inside the host name
     *   is left alone)
     *
     * @param string $domain   domain stored on the collect task
     * @param string $language language code of the task
     * @return string
     */
    private function english_domain($domain, $language)
    {
        $slash_pos = strpos($domain, '/');
        if ($slash_pos !== false) {
            return substr($domain, 0, $slash_pos);
        }

        $prefix = $language . '.';
        if (strpos($domain, $prefix) === 0) {
            return 'www.' . substr($domain, strlen($prefix));
        }

        // Domain does not start with "<language>."; keep the previous
        // replace-anywhere behaviour for such layouts.
        return str_replace($language, 'www', $domain);
    }

    /**
     * Print one progress line for a collect task.
     *
     * @param string $project_id
     * @param string $collect_id
     * @param string $message
     * @return void
     */
    private function log_collect($project_id, $collect_id, $message)
    {
        echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', ' . $message . PHP_EOL;
    }

    /**
     * Get the next task id.
     *
     * Pops from the Redis queue; when the queue is empty it refills it with
     * up to 50 pending language tasks of the first project waiting for
     * collection.
     *
     * @return string|int|bool|null "<project_id>_<collect_id>" for a task,
     *                              false when no project is waiting,
     *                              0 when the current project has no pending
     *                              task left (it is then marked complete),
     *                              or the empty rpop result when nothing
     *                              could be queued.
     */
    protected function get_task()
    {
        $key = 'console_html_language_collect_task';
        $task_id = Redis::rpop($key);
        if ($task_id) {
            return $task_id;
        }


        $update_log = UpdateLog::where('status', UpdateLog::STATUS_COM)->where('collect_status', UpdateLog::COLLECT_STATUS_MAIN)->orderBy('project_id', 'asc')->first();
        if (!$update_log) {
            return false;
        }

        // Map the update log's api type to the route source to collect.
        switch ($update_log->api_type) {
            case 'page':
                $source = RouteMap::SOURCE_PAGE;
                break;
            case 'news':
                $source = RouteMap::SOURCE_NEWS;
                break;
            case 'blog':
                $source = RouteMap::SOURCE_BLOG;
                break;
            case 'tag':
                $source = RouteMap::SOURCE_PRODUCT_KEYWORD;
                break;
            default:
                $source = RouteMap::SOURCE_PRODUCT;
                break;
        }

        $complete = false;
        // Switch to the project's database.
        $project = ProjectServer::useProject($update_log->project_id);
        try {
            if ($project) {
                $collect_list = CollectTask::select(['id', 'project_id'])
                    ->where('project_id', $update_log->project_id)
                    ->where('source', $source)
                    ->where('language', '!=', '')
                    ->where('status', CollectTask::STATUS_UN)
                    ->orderBy('id', 'asc')
                    ->limit(50)
                    ->get();

                if ($collect_list->count() == 0) {
                    $complete = true;
                } else {
                    foreach ($collect_list as $collect) {
                        Redis::lpush($key, $collect['project_id'] . '_' . $collect['id']);
                    }
                }
            }
        } finally {
            // Close the project database even if the query throws.
            DB::disconnect('custom_mysql');
        }

        if ($complete) {
            $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
            $update_log->save();
            return 0;
        }

        return Redis::rpop($key);
    }

    /**
     * Extract the resource URLs referenced by the HTML (img/script/source
     * src, link href and CSS url(...)) and keep those that url_check()
     * accepts.
     *
     * Each distinct URL is checked once: upload_source() replaces every
     * occurrence of a URL in one pass, so duplicates would only cause
     * repeated look-ups, repeated uploads and duplicate CollectSource rows.
     *
     * @param string $html
     * @param string $project_id
     * @param string $domain         domain of the page being collected
     * @param string $web_url_domain
     * @param string $home_url
     * @return array list of url_check() results
     */
    protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
    {
        $source = [];

        if (!$html) {
            return $source;
        }

        // [pattern, index of the capture group holding the URL]
        $patterns = [
            // image
            ['/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
            // js
            ['/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
            // video
            ['/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
            // css
            ['/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
            // css background
            ["/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", 1],
        ];

        $seen = [];
        foreach ($patterns as $pattern_info) {
            preg_match_all($pattern_info[0], $html, $matches);
            $urls = $matches[$pattern_info[1]] ?? [];
            foreach ($urls as $url) {
                if (isset($seen[$url])) {
                    continue;
                }
                $seen[$url] = true;

                $checked = $this->url_check($url, $project_id, $domain, $web_url_domain, $home_url);
                $checked && $source[] = $checked;
            }
        }

        return $source;
    }

    /**
     * Decide whether a resource URL must be downloaded.
     *
     * A URL qualifies when it is relative or points at one of the site's own
     * hosts, uses no scheme other than http/https, and its path has a file
     * extension other than "html".
     *
     * @param string $url
     * @param string $project_id
     * @param string $domain         host used to complete relative URLs
     * @param string $web_url_domain
     * @param string $home_url
     * @return array|false false when the URL is not a candidate; otherwise
     *                     ['download' => bool, 'url' => original URL,
     *                      'url_complete' => absolute URL to fetch, or the
     *                      stored target when already collected]
     */
    protected function url_check($url, $project_id, $domain, $web_url_domain, $home_url)
    {
        if (!$url) {
            return false;
        }

        $arr = parse_url($url);
        if ($arr === false) {
            // Seriously malformed URL.
            return false;
        }

        $scheme = $arr['scheme'] ?? '';
        $host = $arr['host'] ?? '';
        $path = $arr['path'] ?? '';
        $query = $arr['query'] ?? '';

        // data:, javascript:, mailto: ... cannot be fetched over HTTP.
        if ($scheme !== '' && !in_array(strtolower($scheme), ['http', 'https'], true)) {
            return false;
        }

        $own_host = empty($host) || $host === $web_url_domain || $host === $home_url;
        if (!$own_host || !$path || strpos($path, '.') === false) {
            return false;
        }

        $path_arr = explode('.', $path);
        if (end($path_arr) === 'html') {
            return false;
        }

        $existing = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
        if ($existing) {
            // Already collected: reuse the stored target.
            return [
                'download' => false,
                'url' => $url,
                'url_complete' => $existing['target']
            ];
        }

        // A relative path without a leading slash still needs a separator
        // after the host.
        $separator = (empty($host) && $path[0] !== '/') ? '/' : '';

        return [
            'download' => true,
            'url' => $url,
            'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $separator . $path . ($query !== '' ? '?' . $query : '')
        ];
    }

    /**
     * Download the listed resources, record them in CollectSource and
     * replace their URLs inside the HTML.
     *
     * @param string $html
     * @param array  $source     list of url_check() results
     * @param string $project_id
     * @return string HTML with the resource URLs rewritten
     */
    protected function upload_source($html, $source, $project_id)
    {
        foreach ($source as $vs) {
            if (!$vs['download']) {
                // Already collected earlier: only rewrite the link.
                $html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
                continue;
            }

            $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
            if (!$new_source) {
                // Upload failed: leave the original URL in place.
                continue;
            }

            CollectSource::insert([
                'project_id' => $project_id,
                'origin' => $vs['url'],
                'target' => $new_source,
                'created_at' => date('Y-m-d H:i:s'),
                'updated_at' => date('Y-m-d H:i:s'),
            ]);
            $html = str_replace($vs['url'], getImageUrl($new_source), $html);

            if (substr($new_source, -3, 3) == 'css') {
                // Download the resources referenced inside the css file.
                $this->upload_css_source($vs['url_complete'], $new_source, $project_id);
            }
        }

        return $html;
    }

    /**
     * Download the relative resources referenced by url("...") inside a CSS
     * file and store them next to the uploaded stylesheet, so the relative
     * references in the CSS keep working.
     *
     * @param string $css_url    absolute URL of the original CSS file
     * @param string $css_target target of the uploaded CSS file
     * @param string $project_id
     * @return void
     */
    private function upload_css_source($css_url, $css_target, $project_id)
    {
        $css_html = curl_c($css_url, false);
        preg_match_all("/url\(['\"](\s*[^>]+?)['\"]\)/i", (string)$css_html, $result_css_source);
        $css_source = $result_css_source[1] ?? [];

        $url_arr = explode('/', $css_url);
        $target_arr = explode('/', $css_target);
        foreach ($css_source as $vcs) {
            $vcs_arr = parse_url($vcs);
            if ($vcs_arr === false || isset($vcs_arr['host']) || isset($vcs_arr['scheme'])) {
                // Not a relative path (absolute URL, data: URI, ...): skip.
                continue;
            }

            $vcs = $vcs_arr['path'] ?? '';
            if (!$vcs) {
                continue;
            }
            if (strpos($vcs, '.') === false) {
                continue;
            }
            $path_arr = explode('.', $vcs);
            if (end($path_arr) == 'html') {
                continue;
            }

            $source_info = CollectSource::where('project_id', $project_id)->where('origin', $vcs)->first();
            if ($source_info) {
                // Already collected: skip.
                continue;
            }

            // Swap the CSS file name (last segment) for the relative path,
            // both in the source URL and in the upload target.
            $url_arr[count($url_arr) - 1] = $vcs;
            $url_css_complete = implode('/', $url_arr);
            $target_arr[count($target_arr) - 1] = $vcs;
            $path = implode('/', $target_arr);

            $new_source_css = CosService::uploadRemote($project_id, 'source', $url_css_complete, $path);
            if ($new_source_css) {
                CollectSource::insert([
                    'project_id' => $project_id,
                    'origin' => $vcs,
                    'target' => $new_source_css,
                    'created_at' => date('Y-m-d H:i:s'),
                    'updated_at' => date('Y-m-d H:i:s'),
                ]);
            }
        }
    }
}