作者 刘锟

update

@@ -83,7 +83,7 @@ class HtmlCollect extends Command @@ -83,7 +83,7 @@ class HtmlCollect extends Command
83 //采集html页面,下载资源到本地并替换 83 //采集html页面,下载资源到本地并替换
84 try { 84 try {
85 $html = curl_c('https://' . $collect_info->domain . $collect_info->route, false); 85 $html = curl_c('https://' . $collect_info->domain . $collect_info->route, false);
86 - if ($html == '0' || strpos($html,'404 Not Found') !== false) { 86 + if ($html == '0' || strpos($html, '404 Not Found') !== false) {
87 $collect_info->status = CollectTask::STATUS_FAIL; 87 $collect_info->status = CollectTask::STATUS_FAIL;
88 $collect_info->save(); 88 $collect_info->save();
89 89
@@ -138,7 +138,7 @@ class HtmlCollect extends Command @@ -138,7 +138,7 @@ class HtmlCollect extends Command
138 } 138 }
139 139
140 140
141 - $update_log = UpdateLog::whereNotIn('project_id', [555, 626])->where('status', UpdateLog::STATUS_COM)->where('collect_status', UpdateLog::COLLECT_STATUS_UN)->orderBy('project_id', 'asc')->first(); 141 + $update_log = UpdateLog::where('status', UpdateLog::STATUS_COM)->where('collect_status', UpdateLog::COLLECT_STATUS_UN)->orderBy('project_id', 'asc')->first();
142 if (!$update_log) { 142 if (!$update_log) {
143 return false; 143 return false;
144 } 144 }
@@ -286,7 +286,7 @@ class HtmlCollect extends Command @@ -286,7 +286,7 @@ class HtmlCollect extends Command
286 return [ 286 return [
287 'download' => true, 287 'download' => true,
288 'url' => $url, 288 'url' => $url,
289 - 'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path . ($query ? '?' . $query : '') 289 + 'url_complete' => ($scheme ?: 'https') . '://' . $domain . $path . ($query ? '?' . $query : '')
290 ]; 290 ];
291 } else { 291 } else {
292 return [ 292 return [
@@ -302,7 +302,7 @@ class HtmlLanguageCollect extends Command @@ -302,7 +302,7 @@ class HtmlLanguageCollect extends Command
302 return [ 302 return [
303 'download' => true, 303 'download' => true,
304 'url' => $url, 304 'url' => $url,
305 - 'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path . ($query ? '?' . $query : '') 305 + 'url_complete' => ($scheme ?: 'https') . '://' . $domain . $path . ($query ? '?' . $query : '')
306 ]; 306 ];
307 } else { 307 } else {
308 return [ 308 return [
1 -<?php  
2 -  
3 -namespace App\Console\Commands\Update;  
4 -  
5 -use App\Models\Collect\CollectSource;  
6 -use App\Models\Collect\CollectTask;  
7 -use App\Models\Com\UpdateLog;  
8 -use App\Models\Com\UpdateOldInfo;  
9 -use App\Models\RouteMap\RouteMap;  
10 -use App\Services\CosService;  
11 -use App\Services\ProjectServer;  
12 -use Illuminate\Console\Command;  
13 -use Illuminate\Support\Facades\Cache;  
14 -use Illuminate\Support\Facades\DB;  
15 -use Illuminate\Support\Facades\Redis;  
16 -  
/**
 * Long-running console worker used by the 4.0/5.0 -> 6.0 upgrade. It collects
 * HTML pages for a fixed list of "special" projects (see get_task()), mirrors
 * every referenced static resource to COS and stores the rewritten HTML on the
 * collect task.
 *
 * NOTE(review): the original header described this as "minor-language page
 * collection", yet every query filters on language = '' (the value
 * get_domain_en() treats as the English site) - confirm which is intended.
 *
 * Class HtmlLanguageSpecialCollect
 * @package App\Console\Commands\Update
 * @author Akun
 * @date 2023/11/20 14:04
 */
class HtmlLanguageSpecialCollect extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'project_html_language_special_collect';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '执行项目html页面采集';

    /**
     * Entry point: raises the memory limit and processes tasks forever.
     * The loop never exits on its own; the process is expected to be supervised.
     */
    public function handle()
    {
        ini_set('memory_limit', '512M');
        while (true) {
            $this->start_collect();
        }
    }

    /**
     * Runs one iteration: fetch a task, collect its page, then pause.
     *
     * Pauses 60s when no project is waiting at all, otherwise 2s.
     *
     * @return bool always true
     */
    protected function start_collect()
    {
        $task_id = $this->get_task();
        if ($task_id === false) {
            // Every project has been collected.
            sleep(60);
            return true;
        }
        if ($task_id === 0) {
            // The current project has just been completed.
            sleep(2);
            return true;
        }

        // Task ids are "<project_id>_<collect_id>"; anything else is skipped
        // instead of failing on a missing array offset.
        $task_arr = is_string($task_id) ? explode('_', $task_id) : [];
        if (count($task_arr) < 2) {
            sleep(2);
            return true;
        }
        $project_id = $task_arr[0];
        $collect_id = $task_arr[1];

        try {
            // Point the custom connection at the project's database.
            if (ProjectServer::useProject($project_id)) {
                $this->collect_page($project_id, $collect_id);
            }
        } finally {
            // Release the project connection on every path, including early
            // exits and exceptions.
            DB::disconnect('custom_mysql');
        }

        sleep(2);
        return true;
    }

    /**
     * Collects a single page: downloads the HTML, mirrors its resources and
     * stores the rewritten markup on the task row.
     *
     * @param string $project_id
     * @param string $collect_id
     */
    private function collect_page($project_id, $collect_id)
    {
        $collect_info = CollectTask::select(['id', 'domain', 'route', 'language'])
            ->where('id', $collect_id)
            ->where('status', CollectTask::STATUS_UN)
            ->where('language', '=', '')
            ->first();
        if (!$collect_info) {
            return;
        }

        $this->log_line($project_id, $collect_id, 'collect start');

        $collect_info->status = CollectTask::STATUS_ING;
        $collect_info->save();

        // Production and test domains of the site.
        $domain_en = $this->get_domain_en($project_id);
        $old_info = UpdateOldInfo::getOldDomain($project_id, $domain_en);

        // Fetch the page, download its resources and rewrite the references.
        try {
            $html = curl_c('https://' . $collect_info->domain . $collect_info->route, false);
            // Loose comparison kept on purpose: curl_c's failure value is
            // defined elsewhere (presumably '0' or 0 - TODO confirm).
            if ($html == '0' || strpos($html, '404 Not Found') !== false) {
                $this->fail_task($collect_info, $project_id, $collect_id, $html == '0' ? 'no html' : '404 not found');
                return;
            }

            // Inline base64 payloads are removed from the copy used for
            // matching only; the stored HTML keeps them.
            $source_list = $this->html_preg(
                $this->strip_base64_payloads($html),
                $project_id,
                $domain_en,
                $old_info['web_url_domain'],
                $old_info['home_url']
            );

            if ($source_list) {
                $html = $this->upload_source($html, $source_list, $project_id, $domain_en, $old_info['web_url_domain'], $old_info['home_url']);
            }
        } catch (\Throwable $e) {
            // \Throwable rather than \Exception so that engine errors also mark
            // the task as failed instead of leaving it stuck in STATUS_ING.
            $this->fail_task($collect_info, $project_id, $collect_id, $e->getMessage());
            return;
        }

        $collect_info->html = $html;
        $collect_info->status = CollectTask::STATUS_COM;
        $collect_info->save();

        $this->log_line($project_id, $collect_id, 'collect end');
    }

    /**
     * Marks a task as failed and logs the reason.
     *
     * @param mixed  $collect_info task model being processed
     * @param string $project_id
     * @param string $collect_id
     * @param string $reason
     */
    private function fail_task($collect_info, $project_id, $collect_id, $reason)
    {
        $collect_info->status = CollectTask::STATUS_FAIL;
        $collect_info->save();

        $this->log_line($project_id, $collect_id, 'error: ' . $reason);
    }

    /**
     * Writes one timestamped progress line to stdout.
     *
     * @param string $project_id
     * @param string $collect_id
     * @param string $message
     */
    private function log_line($project_id, $collect_id, $message)
    {
        echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', ' . $message . PHP_EOL;
    }

    /**
     * Removes the payload of inline base64 data URIs, leaving the
     * "data:<mime>;base64," prefix in place.
     *
     * Only characters of the base64 alphabet are consumed, so markup that
     * follows a data URI on the same line is preserved.
     *
     * @param string $html
     * @return string
     */
    private function strip_base64_payloads($html)
    {
        $stripped = preg_replace('/(data:[^;"\'\s]*;base64,)[A-Za-z0-9+\/=]++/', '$1', $html);

        // preg_replace yields null on a PCRE failure; fall back to the input.
        return $stripped === null ? $html : $stripped;
    }

    /**
     * Fetches the next task id from the Redis queue, refilling the queue from
     * the database when it is empty.
     *
     * @return string|int|false "<project_id>_<collect_id>" for a task, 0 when
     *                          the current project has just been completed,
     *                          false when there is nothing to do
     */
    protected function get_task()
    {
        $key = 'console_html_language_special_collect_task';
        $task_id = Redis::rpop($key);
        if ($task_id) {
            return $task_id;
        }

        $update_log = UpdateLog::whereIn('project_id', [555, 626])->where('status', UpdateLog::STATUS_COM)->where('collect_status', UpdateLog::COLLECT_STATUS_UN)->first();
        if (!$update_log) {
            return false;
        }

        // Any api_type other than page/news/blog is treated as product.
        $source_map = [
            'page' => RouteMap::SOURCE_PAGE,
            'news' => RouteMap::SOURCE_NEWS,
            'blog' => RouteMap::SOURCE_BLOG,
        ];
        $source = $source_map[(string)$update_log->api_type] ?? RouteMap::SOURCE_PRODUCT;

        $complete = false;
        try {
            // Point the custom connection at the project's database.
            if (ProjectServer::useProject($update_log->project_id)) {
                $collect_list = CollectTask::select(['id', 'project_id'])->where('project_id', $update_log['project_id'])->where('source', $source)->where('language', '=', '')->where('status', CollectTask::STATUS_UN)->orderBy('id', 'asc')->limit(50)->get();

                if ($collect_list->count() == 0) {
                    $complete = true;
                } else {
                    foreach ($collect_list as $collect) {
                        Redis::lpush($key, $collect['project_id'] . '_' . $collect['id']);
                    }
                }
            }
        } finally {
            // Release the project connection even if the query throws.
            DB::disconnect('custom_mysql');
        }

        if ($complete) {
            $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
            $update_log->save();
            return 0;
        }

        // The queue can still be empty here (project not loadable, or another
        // worker drained it); normalise that to false so callers never see null.
        return Redis::rpop($key) ?: false;
    }

    /**
     * Returns the domain of the project's default-language (language = '')
     * site, cached per project.
     *
     * @param string|int $project_id
     * @return string|null
     */
    protected function get_domain_en($project_id)
    {
        // The key carries the project id so that one project's domain is never
        // served for another project.
        $key = 'console_html_language_domain_en_' . $project_id;
        $domain = Cache::get($key);
        if (!$domain) {
            $domain = CollectTask::where('project_id', $project_id)->where('language', '')->value('domain');

            // Do not cache a missing domain.
            if ($domain) {
                Cache::add($key, $domain, 3600);
            }
        }

        return $domain;
    }

    /**
     * Extracts every resource reference from the HTML and classifies it with
     * url_check().
     *
     * @param string     $html
     * @param string|int $project_id
     * @param string     $domain
     * @param string     $web_url_domain
     * @param string     $home_url
     * @return array list of url_check() results, unique per URL, in order of
     *               first appearance
     */
    protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
    {
        if (!$html) {
            return [];
        }

        // pattern => index of the capture group holding the URL
        $patterns = [
            // image
            '/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i' => 2,
            // js
            '/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i' => 2,
            // video
            '/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i' => 2,
            // css
            '/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i' => 2,
            // css background
            "/url\(['\"]?(\s*[^>]+?)['\"]?\)/i" => 1,
            // downloadable resources linked from <a> tags
            '/<a\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i' => 2,
        ];

        $seen = [];
        $source = [];
        foreach ($patterns as $pattern => $group) {
            preg_match_all($pattern, $html, $matches);
            foreach ($matches[$group] ?? [] as $candidate) {
                // url_check() costs up to two queries, so each distinct URL is
                // checked only once.
                if (isset($seen[$candidate])) {
                    continue;
                }
                $seen[$candidate] = true;

                $checked = $this->url_check($candidate, $project_id, $domain, $web_url_domain, $home_url);
                if ($checked && !isset($source[$checked['url']])) {
                    $source[$checked['url']] = $checked;
                }
            }
        }

        return array_values($source);
    }

    /**
     * Decides whether a URL is a site-local static resource and, if so,
     * whether it still has to be downloaded.
     *
     * @param string     $url
     * @param string|int $project_id
     * @param string     $domain         not used here; kept for signature compatibility
     * @param string     $web_url_domain
     * @param string     $home_url
     * @return array|false false when the URL is not a collectable resource,
     *                     otherwise ['download' => bool, 'url' => cleaned URL,
     *                     'url_complete' => absolute source URL when download
     *                     is true, stored target when it is false]
     */
    protected function url_check($url, $project_id, $domain, $web_url_domain, $home_url)
    {
        if (!$url) {
            return false;
        }

        $url = str_replace('&quot;', '', $url);
        $parts = parse_url($url) ?: [];
        $scheme = $parts['scheme'] ?? '';
        $host = $parts['host'] ?? '';
        $path = $parts['path'] ?? '';
        $query = $parts['query'] ?? '';

        // Only scheme-less or http(s) URLs qualify.
        if (!empty($scheme) && $scheme != 'https' && $scheme != 'http') {
            return false;
        }

        // A host, when present, must occur inside one of the site's domains.
        if (
            !empty($host)
            && strpos((string)$web_url_domain, $host) === false
            && strpos((string)$home_url, $host) === false
        ) {
            return false;
        }

        // Needs a root-relative path with a file extension.
        if (!$path || substr($path, 0, 1) != '/' || strpos($path, '.') === false) {
            return false;
        }

        // Pages and feeds are not static resources.
        $extension = substr(strrchr($path, '.'), 1);
        foreach (['html', 'php', 'com', 'xml'] as $blocked) {
            if (strpos($extension, $blocked) !== false) {
                return false;
            }
        }

        // Already mirrored, either under the raw URL or under its home_url form?
        $known = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
        if (!$known) {
            $new_url = str_replace($web_url_domain, $home_url, $url);
            $known = CollectSource::where('project_id', $project_id)->where('origin', $new_url)->first();
        }

        if ($known) {
            return [
                'download' => false,
                'url' => $url,
                'url_complete' => $known['target']
            ];
        }

        return [
            'download' => true,
            'url' => $url,
            'url_complete' => ($scheme ?: 'https') . '://' . $home_url . $path . ($query ? '?' . $query : '')
        ];
    }

    /**
     * Downloads the pending resources, records them and rewrites their
     * references in the HTML. Downloaded CSS/JS files are additionally
     * scanned for nested resources.
     *
     * @param string     $html
     * @param array      $source         url_check() results from html_preg()
     * @param string|int $project_id
     * @param string     $domain
     * @param string     $web_url_domain
     * @param string     $home_url
     * @return string the rewritten HTML
     */
    protected function upload_source($html, $source, $project_id, $domain, $web_url_domain, $home_url)
    {
        foreach ($source as $vs) {
            if (!$vs['download']) {
                $html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
                continue;
            }

            $new_source = $this->store_source($project_id, $vs['url'], $vs['url_complete']);
            if (!$new_source) {
                continue;
            }
            $html = str_replace($vs['url'], getImageUrl($new_source), $html);

            $is_css = substr($new_source, -3, 3) == 'css';
            if ($is_css || substr($new_source, -2, 2) == 'js') {
                $this->rewrite_nested_sources($new_source, $is_css, $vs['url'], $project_id, $domain, $web_url_domain, $home_url);
            }
        }

        return $html;
    }

    /**
     * Uploads a remote file to COS and records the origin -> target mapping.
     *
     * @param string|int $project_id
     * @param string     $origin     URL as referenced in the page
     * @param string     $remote_url absolute URL to download from
     * @return mixed the value returned by CosService::uploadRemote (falsy on failure)
     */
    private function store_source($project_id, $origin, $remote_url)
    {
        $target = CosService::uploadRemote($project_id, 'source', $remote_url);
        if ($target) {
            $now = date('Y-m-d H:i:s');
            CollectSource::insert([
                'project_id' => $project_id,
                'origin' => $origin,
                'target' => $target,
                'created_at' => $now,
                'updated_at' => $now,
            ]);
        }

        return $target;
    }

    /**
     * Scans a mirrored CSS/JS file for nested resources, mirrors them too and
     * uploads the rewritten file over the original.
     *
     * @param string     $new_source     COS path of the mirrored file
     * @param bool       $is_css         true for CSS, false for JS
     * @param string     $parent_url     URL the file was referenced by; base
     *                                   for resolving relative references
     * @param string|int $project_id
     * @param string     $domain
     * @param string     $web_url_domain
     * @param string     $home_url
     */
    private function rewrite_nested_sources($new_source, $is_css, $parent_url, $project_id, $domain, $web_url_domain, $home_url)
    {
        $source_html = curl_c(getImageUrl($new_source), false);

        if ($is_css) {
            preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $source_html, $result_source);
        } else {
            // NOTE(review): "[large|thumb]+" is a character class, not an
            // alternation, so this matches any "<letters>URL:" run built from
            // those characters. Left as-is because narrowing it could drop
            // matches that are currently relied upon - confirm intent.
            preg_match_all("/[large|thumb]+URL:['\"]+(\s*[^>]+?)['\"]+,/i", $source_html, $result_source);
        }

        $nested = $result_source[1] ?? [];
        if (!$nested) {
            return;
        }

        // Looked up once; it does not change while iterating.
        $cos_cdn = config('filesystems.disks.cos.cdn');

        foreach ($nested as $vjs) {
            // A lazy match can span several "URL:" entries; keep what follows
            // the first one inside the capture.
            if (strpos($vjs, 'URL:"') !== false) {
                $vjs = substr($vjs, strpos($vjs, 'URL:"') + 5);
            }

            $vjs_down = str_replace('&quot;', '', $vjs);
            if (strpos($vjs_down, 'data:') !== false) {
                // Skip inline binary data.
                continue;
            }
            if (strlen($vjs_down) > 255) {
                // Skip overly long references.
                continue;
            }

            $vjs_down_host = parse_url($vjs_down, PHP_URL_HOST) ?: '';
            if ($vjs_down_host && $vjs_down_host == $cos_cdn) {
                // Already points at the CDN.
                continue;
            }

            if (empty($vjs_down_host) && substr($vjs_down, 0, 1) != '/') {
                // Relative reference: resolve against the parent file's directory.
                $url_arr = explode('/', $parent_url);
                $url_arr[count($url_arr) - 1] = $vjs_down;
                $vjs_down = implode('/', $url_arr);
            }

            $vjs_result = $this->url_check($vjs_down, $project_id, $domain, $web_url_domain, $home_url);
            if (!$vjs_result) {
                continue;
            }

            if ($vjs_result['download']) {
                $new_vjs = $this->store_source($project_id, $vjs_result['url'], $vjs_result['url_complete']);
                if ($new_vjs) {
                    $source_html = str_replace($vjs, getImageUrl($new_vjs), $source_html);
                }
            } else {
                $source_html = str_replace($vjs, getImageUrl($vjs_result['url_complete']), $source_html);
            }
        }

        // Overwrite the mirrored file with the rewritten content.
        CosService::uploadRemote($project_id, 'source', $new_source, $new_source, $source_html);
    }
}