作者 lyh
@@ -30,12 +30,17 @@ class Temp extends Command @@ -30,12 +30,17 @@ class Temp extends Command
30 30
31 public function handle() 31 public function handle()
32 { 32 {
33 - $data = UpdateLog::where('api_type','website_info')->get();  
34 -  
35 - foreach ($data as $item) {  
36 - UpdateVisit::createLog($item->project_id,$item->api_url);  
37 -  
38 - echo $item->project_id . '成功' . PHP_EOL; 33 + $project = ProjectServer::useProject(626);
  34 + if ($project) {
  35 + CollectTask::select(['id', 'language'])->where('status', 0)->chunk(1000, function ($query) {
  36 +
  37 + foreach ($query as $item) {
  38 + $item->domain = 'lecusostreetlight.quanqiusou.cn/' . $item->language;
  39 + $item->save();
  40 + }
  41 + });
39 } 42 }
  43 + DB::disconnect('custom_mysql');
  44 + echo '成功' . PHP_EOL;
40 } 45 }
41 } 46 }
  1 +<?php
  2 +
  3 +namespace App\Console\Commands\Update;
  4 +
  5 +use App\Models\Collect\CollectSource;
  6 +use App\Models\Collect\CollectTask;
  7 +use App\Models\Com\UpdateLog;
  8 +use App\Models\Com\UpdateOldInfo;
  9 +use App\Models\RouteMap\RouteMap;
  10 +use App\Services\CosService;
  11 +use App\Services\ProjectServer;
  12 +use Illuminate\Console\Command;
  13 +use Illuminate\Support\Facades\Cache;
  14 +use Illuminate\Support\Facades\DB;
  15 +use Illuminate\Support\Facades\Redis;
  16 +
  17 +/**
  18 + * Collects minor-language (non-English) pages for projects upgraded from 4.0/5.0 to 6.0.
  19 + * Class HtmlLanguageSpecialCollect
  20 + * @package App\Console\Commands\Update
  21 + * @author Akun
  22 + * @date 2023/11/20 14:04
  23 + */
  24 +class HtmlLanguageSpecialCollect extends Command
  25 +{
  26 + /**
  27 + * The name and signature of the console command.
  28 + *
  29 + * @var string
  30 + */
  31 + protected $signature = 'project_html_language_special_collect';
  32 +
  33 + /**
  34 + * The console command description.
  35 + *
  36 + * @var string
  37 + */
  38 + protected $description = '执行项目html页面采集';
  39 +
  40 +
  41 + public function handle()
  42 + {
  43 + ini_set('memory_limit', '512M');
  44 + while (true) {
  45 + $this->start_collect();
  46 + }
  47 + }
  48 +
  49 + protected function start_collect()
  50 + {
  51 + $task_id = $this->get_task();
  52 + if ($task_id === false) {
  53 + //所有项目采集完成
  54 + sleep(60);
  55 + return true;
  56 + } elseif ($task_id === 0) {
  57 + //当前项目采集完成
  58 + sleep(2);
  59 + return true;
  60 + }
  61 +
  62 + $task_arr = explode('_', $task_id);
  63 + $project_id = $task_arr[0];
  64 + $collect_id = $task_arr[1];
  65 +
  66 + //设置数据库
  67 + $project = ProjectServer::useProject($project_id);
  68 + if ($project) {
  69 + $collect_info = CollectTask::select(['id', 'domain', 'route', 'language'])->where('id', $collect_id)->where('status', CollectTask::STATUS_UN)->where('language', '!=', '')->first();
  70 +
  71 + if (!$collect_info) {
  72 + sleep(2);
  73 + return true;
  74 + }
  75 +
  76 + echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', collect start' . PHP_EOL;
  77 +
  78 + $collect_info->status = CollectTask::STATUS_ING;
  79 + $collect_info->save();
  80 +
  81 + //获取站点正式和测试域名
  82 + $domain_en = $this->get_domain_en($project_id);
  83 + $old_info = UpdateOldInfo::getOldDomain($project_id, $domain_en);
  84 +
  85 + //采集html页面,下载资源到本地并替换
  86 + try {
  87 + $html = curl_c('https://' . $collect_info->domain . $collect_info->route, false);
  88 + if ($html == '0') {
  89 + $collect_info->status = CollectTask::STATUS_FAIL;
  90 + $collect_info->save();
  91 + echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', error: no html' . PHP_EOL;
  92 + sleep(2);
  93 + return true;
  94 + }
  95 +
  96 + //如果有base64图片,先替换掉,再进行资源匹配
  97 + $new_html = $html;
  98 + preg_match_all("/data:([^;]*);base64,(.*)?\"/", $new_html, $result_img);
  99 + $img_base64 = $result_img[2] ?? [];
  100 + foreach ($img_base64 as $v64) {
  101 + $new_html = str_replace($v64, '', $new_html);
  102 + }
  103 +
  104 + $source_list = $this->html_preg($new_html, $project_id, $domain_en, $old_info['web_url_domain'], $old_info['home_url']);
  105 +
  106 + if ($source_list) {
  107 + $html = $this->upload_source($html, $source_list, $project_id, $domain_en, $old_info['web_url_domain'], $old_info['home_url']);
  108 + }
  109 + } catch (\Exception $e) {
  110 + $collect_info->status = CollectTask::STATUS_FAIL;
  111 + $collect_info->save();
  112 +
  113 + echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', error: ' . $e->getMessage() . PHP_EOL;
  114 + sleep(2);
  115 + return true;
  116 + }
  117 +
  118 + $collect_info->html = $html;
  119 + $collect_info->status = CollectTask::STATUS_COM;
  120 + $collect_info->save();
  121 +
  122 + echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', collect end' . PHP_EOL;
  123 + }
  124 + //关闭数据库
  125 + DB::disconnect('custom_mysql');
  126 +
  127 + sleep(2);
  128 + return true;
  129 + }
  130 +
  131 + //获取任务
  132 + protected function get_task()
  133 + {
  134 + $key = 'console_html_language_special_collect_task';
  135 + $task_id = Redis::rpop($key);
  136 + if ($task_id) {
  137 + return $task_id;
  138 + }
  139 +
  140 +
  141 + $update_log = UpdateLog::where('status', UpdateLog::STATUS_COM)->where('collect_status', '>', UpdateLog::COLLECT_STATUS_MAIN)->orderBy('collect_status', 'asc')->first();
  142 + if (!$update_log) {
  143 + return false;
  144 + }
  145 +
  146 + switch ($update_log->api_type) {
  147 + case 'page':
  148 + $source = RouteMap::SOURCE_PAGE;
  149 + break;
  150 + case 'news':
  151 + $source = RouteMap::SOURCE_NEWS;
  152 + break;
  153 + case 'blog':
  154 + $source = RouteMap::SOURCE_BLOG;
  155 + break;
  156 + default:
  157 + $source = RouteMap::SOURCE_PRODUCT;
  158 + break;
  159 + }
  160 +
  161 + $complete = false;
  162 + //设置数据库
  163 + $project = ProjectServer::useProject($update_log->project_id);
  164 + if ($project) {
  165 + $collect_list = CollectTask::select(['id', 'project_id'])->where('project_id', $update_log['project_id'])->where('source', $source)->where('language', '!=', '')->where('status', CollectTask::STATUS_UN)->orderBy('id', 'asc')->limit(50)->get();
  166 +
  167 + if ($collect_list->count() == 0) {
  168 + $complete = true;
  169 + } else {
  170 + foreach ($collect_list as $collect) {
  171 + Redis::lpush($key, $collect['project_id'] . '_' . $collect['id']);
  172 + }
  173 + }
  174 + }
  175 + //关闭数据库
  176 + DB::disconnect('custom_mysql');
  177 +
  178 + if ($complete) {
  179 + $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
  180 + $update_log->save();
  181 + return 0;
  182 + }
  183 +
  184 + $task_id = Redis::rpop($key);
  185 + return $task_id;
  186 + }
  187 +
  188 + //获取英文站域名
  189 + protected function get_domain_en($project_id)
  190 + {
  191 + $key = 'console_html_language_domain_en';
  192 + $domain = Cache::get($key);
  193 + if (!$domain) {
  194 + $domain = CollectTask::where('project_id', $project_id)->where('language', '')->value('domain');
  195 +
  196 + Cache::add($key, $domain, 3600);
  197 + }
  198 +
  199 + return $domain;
  200 + }
  201 +
  202 + //正则匹配html资源
  203 + protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
  204 + {
  205 + $source = [];
  206 +
  207 + if (!$html) {
  208 + return $source;
  209 + }
  210 +
  211 + //image
  212 + preg_match_all('/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_img);
  213 + $img = $result_img[2] ?? [];
  214 + foreach ($img as $vi) {
  215 + $check_vi = $this->url_check($vi, $project_id, $domain, $web_url_domain, $home_url);
  216 + if ($check_vi && (!in_array($check_vi, $source))) {
  217 + $check_vi && $source[] = $check_vi;
  218 + }
  219 + }
  220 +
  221 + //js
  222 + preg_match_all('/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_js);
  223 + $js = $result_js[2] ?? [];
  224 + foreach ($js as $vj) {
  225 + $check_vj = $this->url_check($vj, $project_id, $domain, $web_url_domain, $home_url);
  226 + if ($check_vj && (!in_array($check_vj, $source))) {
  227 + $check_vj && $source[] = $check_vj;
  228 + }
  229 + }
  230 +
  231 + //video
  232 + preg_match_all('/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_video);
  233 + $video = $result_video[2] ?? [];
  234 + foreach ($video as $vv) {
  235 + $check_vv = $this->url_check($vv, $project_id, $domain, $web_url_domain, $home_url);
  236 + if ($check_vv && (!in_array($check_vv, $source))) {
  237 + $check_vv && $source[] = $check_vv;
  238 + }
  239 + }
  240 +
  241 + //css
  242 + preg_match_all('/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_css);
  243 + $css = $result_css[2] ?? [];
  244 + foreach ($css as $vc) {
  245 + $check_vc = $this->url_check($vc, $project_id, $domain, $web_url_domain, $home_url);
  246 + if ($check_vc && (!in_array($check_vc, $source))) {
  247 + $check_vc && $source[] = $check_vc;
  248 + }
  249 + }
  250 +
  251 + //css background
  252 + preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $html, $result_css_b);
  253 + $css_b = $result_css_b[1] ?? [];
  254 + foreach ($css_b as $vc_b) {
  255 + $check_vc_b = $this->url_check($vc_b, $project_id, $domain, $web_url_domain, $home_url);
  256 + if ($check_vc_b && (!in_array($check_vc_b, $source))) {
  257 + $check_vc_b && $source[] = $check_vc_b;
  258 + }
  259 + }
  260 +
  261 + //a标签下载资源
  262 + preg_match_all('/<a\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_a);
  263 + $down = $result_a[2] ?? [];
  264 + foreach ($down as $vd) {
  265 + $check_vd = $this->url_check($vd, $project_id, $domain, $web_url_domain, $home_url);
  266 + if ($check_vd && (!in_array($check_vd, $source))) {
  267 + $check_vd && $source[] = $check_vd;
  268 + }
  269 + }
  270 +
  271 + return $source;
  272 + }
  273 +
  274 + //判断资源是否需要下载
  275 + protected function url_check($url, $project_id, $domain, $web_url_domain, $home_url)
  276 + {
  277 + if ($url) {
  278 + $url = str_replace('&quot;', '', $url);
  279 + $arr = parse_url($url);
  280 + $scheme = $arr['scheme'] ?? '';
  281 + $host = $arr['host'] ?? '';
  282 + $path = $arr['path'] ?? '';
  283 + $query = $arr['query'] ?? '';
  284 +
  285 + $path_arr = explode('.', $path);
  286 + $path_end = end($path_arr);
  287 + if (
  288 + (empty($scheme) || $scheme == 'https' || $scheme == 'http')
  289 + && (empty($host) || (strpos($web_url_domain, $host) !== false) || (strpos($home_url, $host) !== false))
  290 + && $path
  291 + && (substr($path, 0, 1) == '/')
  292 + && (strpos($path, '.') !== false)
  293 + && (strpos($path_end, 'html') === false)
  294 + && (strpos($path_end, 'php') === false)
  295 + && (strpos($path_end, 'com') === false)
  296 + && (strpos($path_end, 'xml') === false)
  297 + ) {
  298 + $source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
  299 + if (!$source) {
  300 + return [
  301 + 'download' => true,
  302 + 'url' => $url,
  303 + 'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path . ($query ? '?' . $query : '')
  304 + ];
  305 + } else {
  306 + return [
  307 + 'download' => false,
  308 + 'url' => $url,
  309 + 'url_complete' => $source['target']
  310 + ];
  311 + }
  312 + } else {
  313 + return false;
  314 + }
  315 + } else {
  316 + return false;
  317 + }
  318 + }
  319 +
  320 + //下载并替换资源
  321 + protected function upload_source($html, $source, $project_id, $domain, $web_url_domain, $home_url)
  322 + {
  323 + foreach ($source as $vs) {
  324 +
  325 + if ($vs['download']) {
  326 + $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
  327 + if ($new_source) {
  328 + CollectSource::insert([
  329 + 'project_id' => $project_id,
  330 + 'origin' => $vs['url'],
  331 + 'target' => $new_source,
  332 + 'created_at' => date('Y-m-d H:i:s'),
  333 + 'updated_at' => date('Y-m-d H:i:s'),
  334 + ]);
  335 + $html = str_replace($vs['url'], getImageUrl($new_source), $html);
  336 +
  337 + if (substr($new_source, -3, 3) == 'css' || substr($new_source, -2, 2) == 'js') {
  338 +
  339 + $source_html = curl_c(getImageUrl($new_source), false);
  340 +
  341 + if (substr($new_source, -3, 3) == 'css') {
  342 + preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $source_html, $result_source);
  343 + } else {
  344 + preg_match_all("/[large|thumb]+URL:['\"]+(\s*[^>]+?)['\"]+,/i", $source_html, $result_source);
  345 + }
  346 +
  347 + $js_css_source = $result_source[1] ?? [];
  348 + if ($js_css_source) {
  349 + foreach ($js_css_source as $vjs) {
  350 + if (strpos($vjs, 'URL:"') !== false) {
  351 + $vjs = substr($vjs, strpos($vjs, 'URL:"') + 5);
  352 + }
  353 +
  354 + $vjs_down = str_replace('&quot;', '', $vjs);
  355 + if (strpos($vjs_down, 'data:') !== false) {
  356 + //过滤二进制文件
  357 + continue;
  358 + }
  359 + if (strlen($vjs_down) > 255) {
  360 + //过滤太长文件
  361 + continue;
  362 + }
  363 +
  364 + $vjs_down_arr = parse_url($vjs_down);
  365 + $vjs_down_host = $vjs_down_arr['host'] ?? '';
  366 +
  367 + $cos = config('filesystems.disks.cos');
  368 + $cosCdn = $cos['cdn'];
  369 +
  370 + if ($vjs_down_host && $vjs_down_host == $cosCdn) {
  371 + //过滤已经下载的
  372 + continue;
  373 + }
  374 +
  375 + if (empty($vjs_down_host) && substr($vjs_down, 0, 1) != '/') {
  376 + //相对路径
  377 + $url_arr = explode('/', $vs['url']);
  378 + $url_arr[count($url_arr) - 1] = $vjs_down;
  379 + $vjs_down = implode('/', $url_arr);
  380 + }
  381 +
  382 + $vjs_result = $this->url_check($vjs_down, $project_id, $domain, $web_url_domain, $home_url);
  383 + if (!$vjs_result) {
  384 + continue;
  385 + }
  386 +
  387 + if ($vjs_result['download']) {
  388 + $new_vjs = CosService::uploadRemote($project_id, 'source', $vjs_result['url_complete']);
  389 + if ($new_vjs) {
  390 + CollectSource::insert([
  391 + 'project_id' => $project_id,
  392 + 'origin' => $vjs_result['url'],
  393 + 'target' => $new_vjs,
  394 + 'created_at' => date('Y-m-d H:i:s'),
  395 + 'updated_at' => date('Y-m-d H:i:s'),
  396 + ]);
  397 + $source_html = str_replace($vjs, getImageUrl($new_vjs), $source_html);
  398 + }
  399 + } else {
  400 + $source_html = str_replace($vjs, getImageUrl($vjs_result['url_complete']), $source_html);
  401 + }
  402 + }
  403 +
  404 + CosService::uploadRemote($project_id, 'source', $new_source, $new_source, $source_html);
  405 + }
  406 + }
  407 + }
  408 + } else {
  409 + $html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
  410 + }
  411 + }
  412 +
  413 + return $html;
  414 + }
  415 +}