|
1
|
-<?php
|
|
|
|
2
|
-
|
|
|
|
3
|
-namespace App\Console\Commands\Update;
|
|
|
|
4
|
-
|
|
|
|
5
|
-use App\Models\Collect\CollectSource;
|
|
|
|
6
|
-use App\Models\Collect\CollectTask;
|
|
|
|
7
|
-use App\Models\Com\UpdateLog;
|
|
|
|
8
|
-use App\Models\Com\UpdateOldInfo;
|
|
|
|
9
|
-use App\Models\RouteMap\RouteMap;
|
|
|
|
10
|
-use App\Services\CosService;
|
|
|
|
11
|
-use App\Services\ProjectServer;
|
|
|
|
12
|
-use Illuminate\Console\Command;
|
|
|
|
13
|
-use Illuminate\Support\Facades\DB;
|
|
|
|
14
|
-use Illuminate\Support\Facades\Redis;
|
|
|
|
15
|
-
|
|
|
|
16
|
-/**
|
|
|
|
17
|
- * 4.0,5.0升级到6.0,主站页面采集
|
|
|
|
18
|
- * Class HtmlCollectNew
|
|
|
|
19
|
- * @package App\Console\Commands\Update
|
|
|
|
20
|
- * @author Akun
|
|
|
|
21
|
- * @date 2023/11/10 16:04
|
|
|
|
22
|
- */
|
|
|
|
23
|
-class HtmlCollectNew extends Command
|
|
|
|
24
|
-{
|
|
|
|
25
|
- /**
|
|
|
|
26
|
- * The name and signature of the console command.
|
|
|
|
27
|
- *
|
|
|
|
28
|
- * @var string
|
|
|
|
29
|
- */
|
|
|
|
30
|
- protected $signature = 'project_html_collect_new';
|
|
|
|
31
|
-
|
|
|
|
32
|
- /**
|
|
|
|
33
|
- * The console command description.
|
|
|
|
34
|
- *
|
|
|
|
35
|
- * @var string
|
|
|
|
36
|
- */
|
|
|
|
37
|
- protected $description = '执行项目html页面采集';
|
|
|
|
38
|
-
|
|
|
|
39
|
-
|
|
|
|
40
|
/**
 * Command entry point.
 *
 * Raises the PHP memory limit to 512M and then runs collection cycles
 * forever; each cycle is one call to start_collect(), which does its own
 * sleeping. The method never returns.
 */
public function handle()
{
    ini_set('memory_limit', '512M');

    // Endless worker loop: one collection cycle per iteration.
    for (;;) {
        $this->start_collect();
    }
}
|
|
|
|
47
|
-
|
|
|
|
48
|
/**
 * Run one collection cycle.
 *
 * Asks get_task() for a task id of the form "<project_id>_<collect_id>",
 * switches to that project's database, collects the page and finally
 * disconnects the 'custom_mysql' connection. The method always sleeps before
 * returning (60s when get_task() reports false, 2s otherwise) so the endless
 * loop in handle() is throttled.
 *
 * @return bool Always true.
 */
protected function start_collect()
{
    $task_id = $this->get_task();
    if ($task_id === false) {
        // Collection is finished for every project: back off for a minute.
        sleep(60);
        return true;
    }
    if ($task_id === 0) {
        // Collection is finished for the current project.
        sleep(2);
        return true;
    }

    // A usable task id needs both a project part and a collect part; anything
    // else (null, empty string, missing separator) is skipped instead of
    // reading an undefined array offset.
    $task_arr = explode('_', (string)$task_id);
    if (count($task_arr) < 2 || $task_arr[0] === '' || $task_arr[1] === '') {
        sleep(2);
        return true;
    }
    $project_id = $task_arr[0];
    $collect_id = $task_arr[1];

    try {
        // Point the connection at the project's database.
        $project = ProjectServer::useProject($project_id);
        if ($project) {
            $this->collect_task_page($project_id, $collect_id);
        }
    } finally {
        // Close the project database on every path, including early exits
        // and exceptions, so the long-running worker does not leak connections.
        DB::disconnect('custom_mysql');
    }

    sleep(2);
    return true;
}

/**
 * Collect the HTML page of a single pending task and persist the result.
 *
 * Loads the CollectTask row (only if it is still STATUS_UN with an empty
 * language), marks it STATUS_ING, downloads the page over https, mirrors the
 * resources found by html_preg() through upload_source(), and stores the
 * rewritten HTML with STATUS_COM. A '0' response or an exception marks the
 * task STATUS_FAIL instead.
 *
 * @param string $project_id Project part of the task id.
 * @param string $collect_id CollectTask id part of the task id.
 * @return void
 */
private function collect_task_page($project_id, $collect_id)
{
    $collect_info = CollectTask::select(['id', 'domain', 'route'])->where('id', $collect_id)->where('status', CollectTask::STATUS_UN)->where('language', '')->first();
    if (!$collect_info) {
        return;
    }

    echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', collect start' . PHP_EOL;

    $collect_info->status = CollectTask::STATUS_ING;
    $collect_info->save();

    // Original domain information of the site.
    $old_info = UpdateOldInfo::getOldDomain($project_id, $collect_info->domain);

    // Download the page, copy its resources and rewrite the references.
    try {
        $html = curl_c('https://' . $collect_info->domain . $collect_info->route, false);
        if ($html == '0') {
            $collect_info->status = CollectTask::STATUS_FAIL;
            $collect_info->save();
            echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', error: no html' . PHP_EOL;
            return;
        }

        // Blank out inline base64 payloads before resource matching. The
        // payload group is lazy so it stops at the first closing quote; a
        // greedy group would run to the last quote on the line and erase
        // unrelated markup (and the resources inside it) on minified pages.
        $new_html = $html;
        preg_match_all("/data:([^;]*);base64,(.*?)\"/", $new_html, $result_img);
        $img_base64 = $result_img[2] ?? [];
        foreach ($img_base64 as $v64) {
            $new_html = str_replace($v64, '', $new_html);
        }

        $source_list = $this->html_preg($new_html, $project_id, $collect_info->domain, $old_info['web_url_domain'], $old_info['home_url']);

        if ($source_list) {
            // Replacement is applied to the untouched HTML, not the stripped copy.
            $html = $this->upload_source($html, $source_list, $project_id, $collect_info->domain, $old_info['web_url_domain'], $old_info['home_url']);
        }
    } catch (\Exception $e) {
        $collect_info->status = CollectTask::STATUS_FAIL;
        $collect_info->save();

        echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', error: ' . $e->getMessage() . PHP_EOL;
        return;
    }

    $collect_info->html = $html;
    $collect_info->status = CollectTask::STATUS_COM;
    $collect_info->save();

    echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', collect end' . PHP_EOL;
}
|
|
|
|
128
|
-
|
|
|
|
129
|
/**
 * Fetch the next collection task id.
 *
 * Pops from the Redis list 'console_html_collect_new_task'. When the list is
 * empty it looks up the first UpdateLog (project_id >= 799, STATUS_COM,
 * COLLECT_STATUS_UN, lowest project_id first), queues up to 50 pending
 * CollectTask rows of that project as "<project_id>_<id>" and pops one.
 *
 * @return string|int|false Task id string; false when no UpdateLog is
 *                          pending; 0 when the current project has no pending
 *                          tasks left (its log is then marked
 *                          COLLECT_STATUS_MAIN) or when nothing could be
 *                          popped after refilling.
 */
protected function get_task()
{
    $key = 'console_html_collect_new_task';
    $task_id = Redis::rpop($key);
    if ($task_id) {
        return $task_id;
    }

    $update_log = UpdateLog::where('project_id', '>=', 799)->where('status', UpdateLog::STATUS_COM)->where('collect_status', UpdateLog::COLLECT_STATUS_UN)->orderBy('project_id', 'asc')->first();
    if (!$update_log) {
        return false;
    }

    // Map the log's api_type onto the route source; anything unknown is
    // treated as a product source.
    switch ($update_log->api_type) {
        case 'page':
            $source = RouteMap::SOURCE_PAGE;
            break;
        case 'news':
            $source = RouteMap::SOURCE_NEWS;
            break;
        case 'blog':
            $source = RouteMap::SOURCE_BLOG;
            break;
        default:
            $source = RouteMap::SOURCE_PRODUCT;
            break;
    }

    $complete = false;
    try {
        // Point the connection at the project's database.
        $project = ProjectServer::useProject($update_log->project_id);
        if ($project) {
            $collect_list = CollectTask::select(['id', 'project_id'])->where('project_id', $update_log->project_id)->where('source', $source)->where('language', '')->where('status', CollectTask::STATUS_UN)->orderBy('id', 'asc')->limit(50)->get();

            if ($collect_list->count() == 0) {
                $complete = true;
            } else {
                foreach ($collect_list as $collect) {
                    Redis::lpush($key, $collect['project_id'] . '_' . $collect['id']);
                }
            }
        }
    } finally {
        // Close the project database even when a query above throws.
        DB::disconnect('custom_mysql');
    }

    if ($complete) {
        $update_log->collect_status = UpdateLog::COLLECT_STATUS_MAIN;
        $update_log->save();
        return 0;
    }

    // NOTE(review): when useProject() keeps failing, this project stays
    // COLLECT_STATUS_UN and is selected again on every call - confirm whether
    // it should be flagged so later projects are not starved.

    // The list can be empty here (nothing was queued, or another worker
    // drained it first). Report that as 0 so the caller never receives
    // null/false where it expects a task id string.
    $task_id = Redis::rpop($key);
    return $task_id ?: 0;
}
|
|
|
|
185
|
-
|
|
|
|
186
|
/**
 * Extract the resource URLs referenced by an HTML document.
 *
 * Scans, in this order, <img src>, <script src>, <source src>, <link href>,
 * CSS url(...) expressions and <a href>, and passes every captured URL to
 * url_check(). Accepted URLs are returned once each, in order of first
 * appearance; returning a repeated URL more than once would make
 * upload_source() upload it and insert its CollectSource row again.
 *
 * @param string $html            HTML to scan.
 * @param mixed  $project_id      Forwarded to url_check().
 * @param string $domain          Forwarded to url_check().
 * @param string $web_url_domain  Forwarded to url_check().
 * @param string $home_url        Forwarded to url_check().
 * @return array List of url_check() results (keys: download, url, url_complete).
 */
protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
{
    $source = [];

    if (!$html) {
        return $source;
    }

    // Each entry: [regular expression, index of the capture group holding the URL].
    $patterns = [
        // image
        ['/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // js
        ['/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // video
        ['/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // css
        ['/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // css background
        ["/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", 1],
        // downloadable files linked from <a> tags
        ['/<a\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
    ];

    $seen = [];
    foreach ($patterns as $entry) {
        list($pattern, $group) = $entry;

        preg_match_all($pattern, $html, $matches);
        foreach ($matches[$group] ?? [] as $candidate) {
            // Identical raw captures give identical results; skip the repeat
            // (and the lookup url_check() would perform for it).
            if (isset($seen[$candidate])) {
                continue;
            }
            $seen[$candidate] = true;

            $checked = $this->url_check($candidate, $project_id, $domain, $web_url_domain, $home_url);
            // url_check() strips double quotes, so two different captures can
            // still resolve to the same URL; keep only the first.
            if ($checked && !isset($source[$checked['url']])) {
                $source[$checked['url']] = $checked;
            }
        }
    }

    return array_values($source);
}
|
|
|
|
245
|
-
|
|
|
|
246
|
/**
 * Decide whether a URL is a site resource that should be mirrored.
 *
 * A URL qualifies when its scheme is empty, http or https; its host is empty
 * or occurs inside $web_url_domain or $home_url; its path starts with '/',
 * contains a dot, and the text after the last dot contains none of 'html',
 * 'php', 'com' or 'xml'.
 *
 * @param string $url             Candidate URL (double quotes are stripped).
 * @param mixed  $project_id      Project whose CollectSource rows are consulted.
 * @param string $domain          Host used when the URL has none.
 * @param string $web_url_domain  String searched for the URL's host.
 * @param string $home_url        String searched for the URL's host.
 * @return array|false False when the URL does not qualify. Otherwise an array
 *                     with 'download' (true when no CollectSource row exists
 *                     for this origin), 'url' (the cleaned URL) and
 *                     'url_complete' (absolute URL to fetch, or the stored
 *                     target when already known).
 */
protected function url_check($url, $project_id, $domain, $web_url_domain, $home_url)
{
    if (!$url) {
        return false;
    }

    $url = str_replace('"', '', $url);
    $parts = parse_url($url);
    $scheme = $parts['scheme'] ?? '';
    $host = $parts['host'] ?? '';
    $path = $parts['path'] ?? '';
    $query = $parts['query'] ?? '';

    // Text after the last dot of the path.
    $segments = explode('.', $path);
    $extension = end($segments);

    // Only scheme-less, http and https URLs are considered.
    if (!(empty($scheme) || $scheme == 'https' || $scheme == 'http')) {
        return false;
    }

    // The host must be absent or appear in one of the site's own addresses.
    if (!(empty($host) || (strpos($web_url_domain, $host) !== false) || (strpos($home_url, $host) !== false))) {
        return false;
    }

    // Root-relative path containing a dot.
    if (!$path || substr($path, 0, 1) != '/' || strpos($path, '.') === false) {
        return false;
    }

    // Endings that are not treated as downloadable resources.
    foreach (['html', 'php', 'com', 'xml'] as $blocked) {
        if (strpos($extension, $blocked) !== false) {
            return false;
        }
    }

    $known = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
    if ($known) {
        // Already mirrored: reuse the stored target.
        return [
            'download' => false,
            'url' => $url,
            'url_complete' => $known['target']
        ];
    }

    return [
        'download' => true,
        'url' => $url,
        'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path . ($query ? '?' . $query : '')
    ];
}
|
|
|
|
291
|
-
|
|
|
|
292
|
/**
 * Mirror the given resources and rewrite their references in the HTML.
 *
 * Entries flagged 'download' are uploaded through CosService::uploadRemote(),
 * recorded in CollectSource and replaced in $html by getImageUrl() of the new
 * location. Uploaded files whose name ends in 'css' or 'js' are fetched back,
 * the URLs they reference are mirrored the same way, and the rewritten file
 * is uploaded again. Entries not flagged 'download' are only replaced in
 * $html using their stored 'url_complete'.
 *
 * @param string $html            HTML whose references are rewritten.
 * @param array  $source          Results of url_check() (download, url, url_complete).
 * @param mixed  $project_id      Project the resources belong to.
 * @param string $domain          Forwarded to url_check() for nested URLs.
 * @param string $web_url_domain  Forwarded to url_check() for nested URLs.
 * @param string $home_url        Forwarded to url_check() for nested URLs.
 * @return string The rewritten HTML.
 */
protected function upload_source($html, $source, $project_id, $domain, $web_url_domain, $home_url)
{
    foreach ($source as $item) {
        if (!$item['download']) {
            // Known resource: just point the HTML at the stored target.
            $html = str_replace($item['url'], getImageUrl($item['url_complete']), $html);
            continue;
        }

        $uploaded = CosService::uploadRemote($project_id, 'source', $item['url_complete']);
        if (!$uploaded) {
            continue;
        }

        CollectSource::insert([
            'project_id' => $project_id,
            'origin' => $item['url'],
            'target' => $uploaded,
            'created_at' => date('Y-m-d H:i:s'),
            'updated_at' => date('Y-m-d H:i:s'),
        ]);
        $html = str_replace($item['url'], getImageUrl($uploaded), $html);

        // Only stylesheets and scripts are scanned for nested references.
        $is_css = substr($uploaded, -3, 3) == 'css';
        if (!$is_css && substr($uploaded, -2, 2) != 'js') {
            continue;
        }

        $body = curl_c(getImageUrl($uploaded), false);

        // NOTE(review): in the script pattern "[large|thumb]" is a character
        // class, not an alternation, so it matches any run of those letters
        // (or '|') before "URL:" - confirm whether "(?:large|thumb)" was meant.
        $pattern = $is_css
            ? "/url\(['\"]?(\s*[^>]+?)['\"]?\)/i"
            : "/[large|thumb]+URL:['\"]+(\s*[^>]+?)['\"]+,/i";
        preg_match_all($pattern, $body, $matches);

        $references = $matches[1] ?? [];
        if (!$references) {
            continue;
        }

        foreach ($references as $reference) {
            // Keep only what follows an embedded 'URL:"' marker, if any.
            $marker = strpos($reference, 'URL:"');
            if ($marker !== false) {
                $reference = substr($reference, $marker + 5);
            }

            $target = str_replace('"', '', $reference);
            if (strpos($target, 'data:') !== false) {
                // Skip inline binary data.
                continue;
            }
            if (strlen($target) > 255) {
                // Skip overly long values.
                continue;
            }

            $target_parts = parse_url($target);
            $target_host = $target_parts['host'] ?? '';

            $cos_config = config('filesystems.disks.cos');
            $cos_cdn = $cos_config['cdn'];

            if ($target_host && $target_host == $cos_cdn) {
                // Already served from the CDN, i.e. downloaded before.
                continue;
            }

            if (empty($target_host) && substr($target, 0, 1) != '/') {
                // Relative path: resolve against the directory of the parent file.
                $parent_parts = explode('/', $item['url']);
                $parent_parts[count($parent_parts) - 1] = $target;
                $target = implode('/', $parent_parts);
            }

            $checked = $this->url_check($target, $project_id, $domain, $web_url_domain, $home_url);
            if (!$checked) {
                continue;
            }

            if (!$checked['download']) {
                $body = str_replace($reference, getImageUrl($checked['url_complete']), $body);
                continue;
            }

            $nested = CosService::uploadRemote($project_id, 'source', $checked['url_complete']);
            if ($nested) {
                CollectSource::insert([
                    'project_id' => $project_id,
                    'origin' => $checked['url'],
                    'target' => $nested,
                    'created_at' => date('Y-m-d H:i:s'),
                    'updated_at' => date('Y-m-d H:i:s'),
                ]);
                $body = str_replace($reference, getImageUrl($nested), $body);
            }
        }

        // Store the rewritten stylesheet/script over the uploaded copy.
        CosService::uploadRemote($project_id, 'source', $uploaded, $uploaded, $body);
    }

    return $html;
}
|
|
|
|
387
|
-} |
|
|