作者 刘锟

update

@@ -13,7 +13,7 @@ use Illuminate\Support\Facades\DB; @@ -13,7 +13,7 @@ use Illuminate\Support\Facades\DB;
13 use Illuminate\Support\Facades\Redis; 13 use Illuminate\Support\Facades\Redis;
14 14
15 /** 15 /**
16 - * 4.0,5.0升级到6.0,页面采集 16 + * 4.0,5.0升级到6.0,主站页面采集
17 * Class ProjectImport 17 * Class ProjectImport
18 * @package App\Console\Commands 18 * @package App\Console\Commands
19 * @author Akun 19 * @author Akun
@@ -39,7 +39,7 @@ class HtmlCollect extends Command @@ -39,7 +39,7 @@ class HtmlCollect extends Command
39 public function handle() 39 public function handle()
40 { 40 {
41 while (true) { 41 while (true) {
42 - $this->start_collect(); 42 + $this->start_collect();
43 } 43 }
44 } 44 }
45 45
@@ -137,7 +137,7 @@ class HtmlCollect extends Command @@ -137,7 +137,7 @@ class HtmlCollect extends Command
137 //设置数据库 137 //设置数据库
138 $project = ProjectServer::useProject($update_log->project_id); 138 $project = ProjectServer::useProject($update_log->project_id);
139 if ($project) { 139 if ($project) {
140 - $collect_list = CollectTask::select(['id', 'project_id'])->where('project_id', $update_log['project_id'])->where('source', $source)->where('status', CollectTask::STATUS_UN)->orderBy('language', 'asc')->limit(50)->get(); 140 + $collect_list = CollectTask::select(['id', 'project_id'])->where('project_id', $update_log['project_id'])->where('source', $source)->where('language', '')->where('status', CollectTask::STATUS_UN)->orderBy('language', 'asc')->limit(50)->get();
141 141
142 if ($collect_list->count() == 0) { 142 if ($collect_list->count() == 0) {
143 $complete = true; 143 $complete = true;
@@ -151,7 +151,7 @@ class HtmlCollect extends Command @@ -151,7 +151,7 @@ class HtmlCollect extends Command
151 DB::disconnect('custom_mysql'); 151 DB::disconnect('custom_mysql');
152 152
153 if ($complete) { 153 if ($complete) {
154 - $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM; 154 + $update_log->collect_status = UpdateLog::COLLECT_STATUS_MAIN;
155 return 0; 155 return 0;
156 } 156 }
157 157
  1 +<?php
  2 +
  3 +namespace App\Console\Commands\Update;
  4 +
  5 +use App\Models\Collect\CollectSource;
  6 +use App\Models\Collect\CollectTask;
  7 +use App\Models\Com\UpdateLog;
  8 +use App\Models\RouteMap\RouteMap;
  9 +use App\Services\CosService;
  10 +use App\Services\ProjectServer;
  11 +use Illuminate\Console\Command;
  12 +use Illuminate\Support\Facades\DB;
  13 +use Illuminate\Support\Facades\Redis;
  14 +
/**
 * Upgrade from 4.0/5.0 to 6.0: HTML page collection for the minor-language
 * pages (collect tasks whose `language` column is not empty).
 *
 * The command runs forever: it pops "<project_id>_<collect_id>" task ids from a
 * Redis list, downloads the page, mirrors its static resources through
 * CosService and stores the rewritten HTML on the CollectTask row.
 *
 * Class HtmlLanguageCollect
 * @package App\Console\Commands\Update
 * @author Akun
 * @date 2023/11/20 14:04
 */
class HtmlLanguageCollect extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'project_html_language_collect';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '执行项目html页面采集';

    /**
     * Entry point: process collect tasks in an endless loop.
     * start_collect() does its own sleeping between iterations.
     */
    public function handle()
    {
        while (true) {
            $this->start_collect();
        }
    }

    /**
     * Process a single collect task (or sleep when there is nothing to do).
     *
     * @return bool always true; the return value is not used by handle()
     */
    protected function start_collect()
    {
        $task_id = $this->get_task();
        if ($task_id === false) {
            // Nothing to collect for any project: back off for a minute.
            sleep(60);
            return true;
        } elseif ($task_id === 0) {
            // The current project has just been completed: look for the next one shortly.
            sleep(2);
            return true;
        }

        // Task ids are pushed by get_task() as "<project_id>_<collect_id>".
        $task_arr = explode('_', (string)$task_id);
        if (!isset($task_arr[1])) {
            // Malformed queue entry: drop it instead of failing on a missing offset.
            sleep(2);
            return true;
        }
        $project_id = $task_arr[0];
        $collect_id = $task_arr[1];

        try {
            // Switch to the project database.
            if (ProjectServer::useProject($project_id)) {
                $collect_info = CollectTask::select(['id', 'domain', 'route'])
                    ->where('id', $collect_id)
                    ->where('status', CollectTask::STATUS_UN)
                    ->first();

                if ($collect_info) {
                    $this->log_progress($project_id, $collect_id, 'collect start');

                    $collect_info->status = CollectTask::STATUS_ING;
                    $collect_info->save();

                    // Fetch the page, download its resources and rewrite the URLs.
                    try {
                        $page_url = 'https://' . $collect_info->domain . $collect_info->route;
                        $html = file_get_contents($page_url);
                        if ($html === false) {
                            // Without an error handler converting warnings to exceptions a
                            // failed fetch would otherwise be stored as a completed page.
                            throw new \RuntimeException('failed to fetch ' . $page_url);
                        }

                        $source_list = $this->html_preg($html, $project_id, $collect_info->domain);
                        if ($source_list) {
                            $html = $this->upload_source($html, $source_list, $project_id);
                        }
                    } catch (\Exception $e) {
                        // NOTE(review): the task stays in STATUS_ING after a failure and is
                        // never retried by this command; confirm this is intended.
                        $this->log_progress($project_id, $collect_id, 'error: ' . $e->getMessage());
                        return true;
                    }

                    $collect_info->html = $html;
                    $collect_info->status = CollectTask::STATUS_COM;
                    $collect_info->save();

                    $this->log_progress($project_id, $collect_id, 'collect end');
                }
            }
        } finally {
            // Always release the project connection, including on the early-return paths.
            DB::disconnect('custom_mysql');
        }

        sleep(2);
        return true;
    }

    /**
     * Print one progress line for a task.
     *
     * @param mixed  $project_id
     * @param mixed  $collect_id
     * @param string $message text appended after the ids
     */
    private function log_progress($project_id, $collect_id, $message)
    {
        echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', ' . $message . PHP_EOL;
    }

    /**
     * Fetch the next task id, refilling the Redis queue when it is empty.
     *
     * @return string|int|false "<project_id>_<collect_id>" for a task,
     *                          0 when the current project has no pending tasks left,
     *                          false when there is nothing to do
     */
    protected function get_task()
    {
        $key = 'console_html_language_collect_task';
        $task_id = Redis::rpop($key);
        if ($task_id) {
            return $task_id;
        }

        // Queue is empty: pick the first imported project whose main site is collected.
        $update_log = UpdateLog::where('status', UpdateLog::STATUS_COM)
            ->where('collect_status', UpdateLog::COLLECT_STATUS_MAIN)
            ->orderBy('project_id', 'asc')
            ->first();
        if (!$update_log) {
            return false;
        }

        // Map the update log's api_type to the route source; anything else is a product.
        $source_map = [
            'page' => RouteMap::SOURCE_PAGE,
            'news' => RouteMap::SOURCE_NEWS,
            'blog' => RouteMap::SOURCE_BLOG,
            'tag' => RouteMap::SOURCE_PRODUCT_KEYWORD,
        ];
        $source = $source_map[(string)$update_log->api_type] ?? RouteMap::SOURCE_PRODUCT;

        $complete = false;
        try {
            // Switch to the project database.
            if (ProjectServer::useProject($update_log->project_id)) {
                $collect_list = CollectTask::select(['id', 'project_id'])
                    ->where('project_id', $update_log->project_id)
                    ->where('source', $source)
                    ->where('language', '!=', '')
                    ->where('status', CollectTask::STATUS_UN)
                    ->orderBy('language', 'asc')
                    ->limit(50)
                    ->get();

                if ($collect_list->count() == 0) {
                    $complete = true;
                } else {
                    foreach ($collect_list as $collect) {
                        Redis::lpush($key, $collect['project_id'] . '_' . $collect['id']);
                    }
                }
            }
        } finally {
            // Release the project connection.
            DB::disconnect('custom_mysql');
        }

        if ($complete) {
            $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
            // Persist the new status; without save() the same project would be
            // selected again on every call.
            $update_log->save();
            return 0;
        }

        // Another worker may have drained the queue in the meantime (or the project
        // could not be loaded): report "nothing to do" rather than an empty id.
        $task_id = Redis::rpop($key);
        return $task_id ?: false;
    }

    /**
     * Extract the resource URLs referenced by an HTML document.
     *
     * Looks at img/script/source `src`, link `href` and CSS `url(...)` values and
     * keeps those accepted by url_check(). Each distinct URL is returned once.
     *
     * @param string|false $html
     * @param mixed        $project_id
     * @param string       $domain domain of the collected page
     * @return array list of url_check() results
     */
    protected function html_preg($html, $project_id, $domain)
    {
        if (!$html) {
            return [];
        }

        // Pattern plus the capture group holding the URL.
        $rules = [
            // image
            ['pattern' => '/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 'group' => 2],
            // js
            ['pattern' => '/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 'group' => 2],
            // video
            ['pattern' => '/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 'group' => 2],
            // css
            ['pattern' => '/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 'group' => 2],
            // css background
            ['pattern' => "/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", 'group' => 1],
        ];

        $source = [];
        foreach ($rules as $rule) {
            preg_match_all($rule['pattern'], $html, $matches);
            foreach ($matches[$rule['group']] ?? [] as $url) {
                if (isset($source[$url])) {
                    // Already accepted: processing a URL twice would upload it twice
                    // and run the replacement over already rewritten HTML.
                    continue;
                }
                $checked = $this->url_check($url, $project_id, $domain);
                if ($checked) {
                    $source[$url] = $checked;
                }
            }
        }

        return array_values($source);
    }

    /**
     * Decide whether a resource URL has to be mirrored.
     *
     * A URL is accepted when its host contains neither ".globalso." nor ".goodao."
     * and its path is non-empty and contains a dot.
     *
     * @param string $url
     * @param mixed  $project_id
     * @param string $domain fallback host for URLs without one
     * @return array|false false when the URL is ignored, otherwise
     *                     ['download' => bool, 'url' => string, 'url_complete' => string];
     *                     'download' is false when a CollectSource row already exists,
     *                     in which case 'url_complete' is that row's target
     */
    protected function url_check($url, $project_id, $domain)
    {
        if (!$url) {
            return false;
        }

        $arr = parse_url($url);
        if (!is_array($arr)) {
            // parse_url() returns false for seriously malformed URLs.
            return false;
        }
        $scheme = $arr['scheme'] ?? '';
        $host = $arr['host'] ?? '';
        $path = $arr['path'] ?? '';

        $is_own_host = strpos($host, '.globalso.') !== false || strpos($host, '.goodao.') !== false;
        if ($is_own_host || !$path || strpos($path, '.') === false) {
            return false;
        }

        $source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
        if ($source) {
            return [
                'download' => false,
                'url' => $url,
                'url_complete' => $source['target']
            ];
        }

        return [
            'download' => true,
            'url' => $url,
            'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path
        ];
    }

    /**
     * Mirror the given resources and replace their URLs inside the HTML.
     *
     * @param string $html
     * @param array  $source list produced by html_preg()
     * @param mixed  $project_id
     * @return string the HTML with resource URLs rewritten
     */
    protected function upload_source($html, $source, $project_id)
    {
        foreach ($source as $vs) {
            if (!$vs['download']) {
                // Already mirrored earlier: only rewrite the URL.
                $html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
                continue;
            }

            $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
            if (!$new_source) {
                continue;
            }

            CollectSource::insert([
                'project_id' => $project_id,
                'origin' => $vs['url'],
                'target' => $new_source,
                'created_at' => date('Y-m-d H:i:s'),
                'updated_at' => date('Y-m-d H:i:s'),
            ]);
            $html = str_replace($vs['url'], getImageUrl($new_source), $html);

            if (substr($new_source, -3, 3) === 'css') {
                // Also mirror the resources referenced from inside the css file.
                $this->upload_css_sources($vs['url_complete'], $new_source, $project_id);
            }
        }

        return $html;
    }

    /**
     * Mirror the relative url(...) resources referenced by a css file, keeping
     * them next to the uploaded css so the relative references keep working.
     *
     * @param string $css_url    absolute URL of the original css file
     * @param string $css_target path of the uploaded css file
     * @param mixed  $project_id
     */
    private function upload_css_sources($css_url, $css_target, $project_id)
    {
        $css_html = file_get_contents($css_url);
        if ($css_html === false) {
            return;
        }

        preg_match_all("/url\(['\"](\s*[^>]+?)['\"]\)/i", $css_html, $result_css_source);
        $css_source = $result_css_source[1] ?? [];

        $url_arr = explode('/', $css_url);
        $target_arr = explode('/', $css_target);
        foreach ($css_source as $vcs) {
            $vcs_arr = parse_url($vcs);
            if (!is_array($vcs_arr) || isset($vcs_arr['scheme']) || isset($vcs_arr['host'])) {
                // Not a relative path (absolute, protocol-relative or data: URL): skip.
                continue;
            }

            $vcs = $vcs_arr['path'] ?? '';
            if (!$vcs) {
                continue;
            }

            $source_info = CollectSource::where('project_id', $project_id)->where('origin', $vcs)->first();
            if ($source_info) {
                // Already mirrored: skip.
                continue;
            }

            // Swap the css file name for the relative path, on both the remote URL
            // and the upload target.
            $url_arr[count($url_arr) - 1] = $vcs;
            $url_css_complete = implode('/', $url_arr);
            $target_arr[count($target_arr) - 1] = $vcs;
            $path = implode('/', $target_arr);

            $new_source_css = CosService::uploadRemote($project_id, 'source', $url_css_complete, $path);
            if ($new_source_css) {
                CollectSource::insert([
                    'project_id' => $project_id,
                    'origin' => $vcs,
                    'target' => $new_source_css,
                    'created_at' => date('Y-m-d H:i:s'),
                    'updated_at' => date('Y-m-d H:i:s'),
                ]);
            }
        }
    }
}
@@ -14,7 +14,8 @@ class UpdateLog extends Model @@ -14,7 +14,8 @@ class UpdateLog extends Model
14 const STATUS_COM = 2;//导入完成 14 const STATUS_COM = 2;//导入完成
15 15
16 const COLLECT_STATUS_UN = 0;//未开始 16 const COLLECT_STATUS_UN = 0;//未开始
17 - const COLLECT_STATUS_COM = 1;//采集完成 17 + const COLLECT_STATUS_COM = 1;//全站小语种采集完成
  18 + const COLLECT_STATUS_MAIN = 2;//英语主站采集完成
18 19
19 /** 20 /**
20 * 创建更新日志 21 * 创建更新日志