作者 刘锟

update

  1 +<?php
  2 +
  3 +namespace App\Console\Commands\Update;
  4 +
  5 +use App\Models\Collect\CollectSource;
  6 +use App\Models\Collect\CollectTask;
  7 +use App\Models\Com\UpdateLog;
  8 +use App\Models\RouteMap\RouteMap;
  9 +use App\Services\CosService;
  10 +use App\Services\ProjectServer;
  11 +use Illuminate\Console\Command;
  12 +use Illuminate\Support\Facades\DB;
  13 +use Illuminate\Support\Facades\Redis;
  14 +
/**
 * Collects custom HTML pages from the main site for projects being upgraded from 4.0/5.0 to 6.0.
 * Class HtmlCustomCollect
 * @package App\Console\Commands\Update
 * @author Akun
 * @date 2023/12/13 14:44
 */
  22 +class HtmlCustomCollect extends Command
  23 +{
  24 + /**
  25 + * The name and signature of the console command.
  26 + *
  27 + * @var string
  28 + */
  29 + protected $signature = 'project_html_custom_collect';
  30 +
  31 + /**
  32 + * The console command description.
  33 + *
  34 + * @var string
  35 + */
  36 + protected $description = '执行项目自定义html页面采集';
  37 +
  38 +
  39 + public function handle()
  40 + {
  41 + ini_set('memory_limit', '512M');
  42 +
  43 + $project_id = 626;
  44 + $project_site = 'v6-m342g.globalso.site';
  45 + $pages = [
  46 + 'https://www.lecusostreetlight.com/project_catalog/project/'
  47 + ];
  48 +
  49 + foreach ($pages as $page) {
  50 + $this->start_collect($page, $project_id, $project_site);
  51 + }
  52 + }
  53 +
  54 + protected function start_collect($page, $project_id, $project_site)
  55 + {
  56 + $page_arr = parse_url($page);
  57 + $domain = $page_arr['host'];
  58 + $path = $page_arr['path'];
  59 +
  60 + //设置数据库
  61 + $project = ProjectServer::useProject($project_id);
  62 + if ($project) {
  63 + echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', page: ' . $page . ', collect start' . PHP_EOL;
  64 +
  65 + //获取站点原始域名信息
  66 + $old_info = getOldDomain($project_id, $domain);
  67 +
  68 + //采集html页面,下载资源到本地并替换
  69 + try {
  70 + $html = curl_c($page, false);
  71 + if ($html == '0') {
  72 + echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', page: ' . $page . ', error: no html' . PHP_EOL;
  73 + sleep(2);
  74 + return true;
  75 + }
  76 +
  77 + //如果有base64图片,先替换掉,再进行资源匹配
  78 + $new_html = $html;
  79 + preg_match_all("/data:([^;]*);base64,(.*)?\"/", $new_html, $result_img);
  80 + $img_base64 = $result_img[2] ?? [];
  81 + foreach ($img_base64 as $v64) {
  82 + $new_html = str_replace($v64, '', $new_html);
  83 + }
  84 +
  85 + //匹配资源链接
  86 + $source_list = $this->html_preg($new_html, $project_id, $domain, $old_info['web_url_domain'], $old_info['home_url']);
  87 +
  88 + //下载资源
  89 + if ($source_list) {
  90 + $html = $this->upload_source($html, $source_list, $project_id, $domain, $old_info['web_url_domain'], $old_info['home_url']);
  91 + }
  92 +
  93 + //替换域名
  94 + $html = str_replace($old_info['web_url_domain'], $project_site, $html);
  95 + $html = str_replace($old_info['home_url'], $project_site, $html);
  96 +
  97 + //暂时隐藏小语种
  98 + $html = str_replace('<div class="change-language ensemble">', '<div class="change-language ensemble" style="display: none">', $html);
  99 + $html = str_replace('<div class="language_more">', '<div class="language_more" style="display: none">', $html);
  100 + $html = str_replace('</body>', '<script src="https://ecdn6.globalso.com/public/customerVisit.min.js\"></script></body>', $html);
  101 +
  102 + //html写入文件
  103 + $file_path = '/www/wwwroot/globalso-v6-c-glo/public/' . $project_site . $path;
  104 + $file_path = iconv("UTF-8", "GBK", $file_path);
  105 + if (!file_exists($file_path)) {
  106 + mkdir($file_path, 0777, true);
  107 + }
  108 +
  109 + file_put_contents($file_path . 'index.html', $html);
  110 + chmod($file_path . 'index.html', 0777);
  111 +
  112 + } catch (\Exception $e) {
  113 + echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', page: ' . $page . ', error: ' . $e->getMessage() . PHP_EOL;
  114 + sleep(2);
  115 + return true;
  116 + }
  117 +
  118 +
  119 + echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', page: ' . $page . ', collect end' . PHP_EOL;
  120 + } else {
  121 + echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', page: ' . $page . ', no project' . PHP_EOL;
  122 + }
  123 + //关闭数据库
  124 + DB::disconnect('custom_mysql');
  125 +
  126 + sleep(2);
  127 + return true;
  128 + }
  129 +
  130 + //正则匹配html资源
  131 + protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
  132 + {
  133 + $source = [];
  134 +
  135 + if (!$html) {
  136 + return $source;
  137 + }
  138 +
  139 + //image
  140 + preg_match_all('/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_img);
  141 + $img = $result_img[2] ?? [];
  142 + foreach ($img as $vi) {
  143 + $check_vi = $this->url_check($vi, $project_id, $domain, $web_url_domain, $home_url);
  144 + $check_vi && $source[] = $check_vi;
  145 + }
  146 +
  147 + //js
  148 + preg_match_all('/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_js);
  149 + $js = $result_js[2] ?? [];
  150 + foreach ($js as $vj) {
  151 + $check_vj = $this->url_check($vj, $project_id, $domain, $web_url_domain, $home_url);
  152 + $check_vj && $source[] = $check_vj;
  153 + }
  154 +
  155 + //video
  156 + preg_match_all('/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_video);
  157 + $video = $result_video[2] ?? [];
  158 + foreach ($video as $vv) {
  159 + $check_vv = $this->url_check($vv, $project_id, $domain, $web_url_domain, $home_url);
  160 + $check_vv && $source[] = $check_vv;
  161 + }
  162 +
  163 + //css
  164 + preg_match_all('/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_css);
  165 + $css = $result_css[2] ?? [];
  166 + foreach ($css as $vc) {
  167 + $check_vc = $this->url_check($vc, $project_id, $domain, $web_url_domain, $home_url);
  168 + $check_vc && $source[] = $check_vc;
  169 + }
  170 +
  171 + //css background
  172 + preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $html, $result_css_b);
  173 + $css_b = $result_css_b[1] ?? [];
  174 + foreach ($css_b as $vc_b) {
  175 + $check_vc_b = $this->url_check($vc_b, $project_id, $domain, $web_url_domain, $home_url);
  176 + $check_vc_b && $source[] = $check_vc_b;
  177 + }
  178 +
  179 + //a标签下载资源
  180 + preg_match_all('/<a\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_a);
  181 + $down = $result_a[2] ?? [];
  182 + foreach ($down as $vd) {
  183 + $check_vd = $this->url_check($vd, $project_id, $domain, $web_url_domain, $home_url);
  184 + $check_vd && $source[] = $check_vd;
  185 + }
  186 +
  187 + return $source;
  188 + }
  189 +
  190 + //判断资源是否需要下载
  191 + protected function url_check($url, $project_id, $domain, $web_url_domain, $home_url)
  192 + {
  193 + if ($url) {
  194 + $url = str_replace('&quot;', '', $url);
  195 + $arr = parse_url($url);
  196 + $scheme = $arr['scheme'] ?? '';
  197 + $host = $arr['host'] ?? '';
  198 + $path = $arr['path'] ?? '';
  199 + $query = $arr['query'] ?? '';
  200 +
  201 + $path_arr = explode('.', $path);
  202 + if (
  203 + (empty($scheme) || $scheme == 'https' || $scheme == 'http')
  204 + && (empty($host) || (strpos($web_url_domain, $host) !== false) || (strpos($home_url, $host) !== false))
  205 + && $path
  206 + && (substr($path, 0, 1) == '/')
  207 + && (strpos($path, '.') !== false)
  208 + && (!in_array(end($path_arr), ['html', 'php', 'com', 'xml']))
  209 + ) {
  210 + $source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
  211 + if (!$source) {
  212 + return [
  213 + 'download' => true,
  214 + 'url' => $url,
  215 + 'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path . ($query ? '?' . $query : '')
  216 + ];
  217 + } else {
  218 + return [
  219 + 'download' => false,
  220 + 'url' => $url,
  221 + 'url_complete' => $source['target']
  222 + ];
  223 + }
  224 + } else {
  225 + return false;
  226 + }
  227 + } else {
  228 + return false;
  229 + }
  230 + }
  231 +
  232 + //下载并替换资源
  233 + protected function upload_source($html, $source, $project_id, $domain, $web_url_domain, $home_url)
  234 + {
  235 + foreach ($source as $vs) {
  236 +
  237 + if ($vs['download']) {
  238 + $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
  239 + if ($new_source) {
  240 + CollectSource::insert([
  241 + 'project_id' => $project_id,
  242 + 'origin' => $vs['url'],
  243 + 'target' => $new_source,
  244 + 'created_at' => date('Y-m-d H:i:s'),
  245 + 'updated_at' => date('Y-m-d H:i:s'),
  246 + ]);
  247 + $html = str_replace($vs['url'], getImageUrl($new_source), $html);
  248 +
  249 + if (substr($new_source, -3, 3) == 'css' || substr($new_source, -2, 2) == 'js') {
  250 +
  251 + $source_html = curl_c(getImageUrl($new_source), false);
  252 +
  253 + if (substr($new_source, -3, 3) == 'css') {
  254 + preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $source_html, $result_source);
  255 + } else {
  256 + preg_match_all("/[large|thumb]+URL:['\"]+(\s*[^>]+?)['\"]+,/i", $source_html, $result_source);
  257 + }
  258 +
  259 + $js_css_source = $result_source[1] ?? [];
  260 + if ($js_css_source) {
  261 + foreach ($js_css_source as $vjs) {
  262 + $vjs_down = str_replace('&quot;', '', $vjs);
  263 + if (strpos($vjs_down, 'data:') !== false) {
  264 + //过滤二进制文件
  265 + continue;
  266 + }
  267 + if (strlen($vjs_down) > 255) {
  268 + //过滤太长文件
  269 + continue;
  270 + }
  271 +
  272 + $vjs_down_arr = parse_url($vjs_down);
  273 + $vjs_down_host = $vjs_down_arr['host'] ?? '';
  274 +
  275 + $cos = config('filesystems.disks.cos');
  276 + $cosCdn = $cos['cdn'];
  277 +
  278 + if ($vjs_down_host && $vjs_down_host == $cosCdn) {
  279 + //过滤已经下载的
  280 + continue;
  281 + }
  282 +
  283 + if (empty($vjs_down_host) && substr($vjs_down, 0, 1) != '/') {
  284 + //相对路径
  285 + $url_arr = explode('/', $vs['url']);
  286 + $url_arr[count($url_arr) - 1] = $vjs_down;
  287 + $vjs_down = implode('/', $url_arr);
  288 + }
  289 +
  290 + $vjs_result = $this->url_check($vjs_down, $project_id, $domain, $web_url_domain, $home_url);
  291 + if (!$vjs_result) {
  292 + continue;
  293 + }
  294 +
  295 + if ($vjs_result['download']) {
  296 + $new_vjs = CosService::uploadRemote($project_id, 'source', $vjs_result['url_complete']);
  297 + if ($new_vjs) {
  298 + CollectSource::insert([
  299 + 'project_id' => $project_id,
  300 + 'origin' => $vjs_result['url'],
  301 + 'target' => $new_vjs,
  302 + 'created_at' => date('Y-m-d H:i:s'),
  303 + 'updated_at' => date('Y-m-d H:i:s'),
  304 + ]);
  305 + $source_html = str_replace($vjs, getImageUrl($new_vjs), $source_html);
  306 + }
  307 + } else {
  308 + $source_html = str_replace($vjs, getImageUrl($vjs_result['url_complete']), $source_html);
  309 + }
  310 + }
  311 +
  312 + CosService::uploadRemote($project_id, 'source', $new_source, $new_source, $source_html);
  313 + }
  314 + }
  315 + }
  316 + } else {
  317 + $html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
  318 + }
  319 + }
  320 +
  321 + return $html;
  322 + }
  323 +}