|
|
|
1
|
+<?php
|
|
|
|
2
|
+
|
|
|
|
3
|
+namespace App\Console\Commands\Update;
|
|
|
|
4
|
+
|
|
|
|
5
|
+use App\Models\Collect\CollectSource;
|
|
|
|
6
|
+use App\Models\Collect\CollectTask;
|
|
|
|
7
|
+use App\Models\Com\UpdateLog;
|
|
|
|
8
|
+use App\Models\RouteMap\RouteMap;
|
|
|
|
9
|
+use App\Services\CosService;
|
|
|
|
10
|
+use App\Services\ProjectServer;
|
|
|
|
11
|
+use Illuminate\Console\Command;
|
|
|
|
12
|
+use Illuminate\Support\Facades\DB;
|
|
|
|
13
|
+use Illuminate\Support\Facades\Redis;
|
|
|
|
14
|
+
|
|
|
|
15
|
/**
 * Upgrade from 4.0/5.0 to 6.0: collects the minor-language HTML pages of a project.
 *
 * Long-running worker. Each loop iteration takes one "{project_id}_{collect_id}"
 * task from a Redis list, downloads the page, mirrors the static resources it
 * references (images, scripts, videos, stylesheets, CSS backgrounds) through
 * CosService and stores the rewritten HTML on the CollectTask row.
 *
 * Class HtmlLanguageCollect
 * @package App\Console\Commands\Update
 * @author Akun
 * @date 2023/11/20 14:04
 */
class HtmlLanguageCollect extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'project_html_language_collect';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '执行项目html页面采集';

    /**
     * Entry point: processes tasks forever, one per iteration.
     * The pauses between iterations are handled inside start_collect().
     */
    public function handle()
    {
        while (true) {
            $this->start_collect();
        }
    }

    /**
     * Fetches one task and processes it.
     *
     * Sleeps 60s when get_task() reports there is nothing to collect at all,
     * and 2s in every other case (project just completed, malformed task id,
     * task processed or skipped).
     *
     * @return bool always true
     */
    protected function start_collect()
    {
        $task_id = $this->get_task();
        if ($task_id === false) {
            // Every project has been collected.
            sleep(60);
            return true;
        }
        if ($task_id === 0) {
            // The current project has just been marked as collected.
            sleep(2);
            return true;
        }

        // A task id has the form "{project_id}_{collect_id}"; anything else is dropped.
        $task_arr = explode('_', (string)$task_id);
        if (count($task_arr) < 2 || $task_arr[0] === '' || $task_arr[1] === '') {
            sleep(2);
            return true;
        }
        $project_id = $task_arr[0];
        $collect_id = $task_arr[1];

        try {
            // Point the custom connection at the project's database.
            $project = ProjectServer::useProject($project_id);
            if ($project) {
                $this->collect_page($project_id, $collect_id);
            }
        } finally {
            // Always release the project connection, whichever path was taken.
            DB::disconnect('custom_mysql');
        }

        sleep(2);
        return true;
    }

    /**
     * Downloads a single page, mirrors its resources and stores the resulting HTML.
     *
     * Does nothing when the task no longer exists with status STATUS_UN
     * (for example because another worker already took it).
     *
     * @param string $project_id
     * @param string $collect_id
     * @return void
     */
    private function collect_page($project_id, $collect_id)
    {
        $collect_info = CollectTask::select(['id', 'domain', 'route'])
            ->where('id', $collect_id)
            ->where('status', CollectTask::STATUS_UN)
            ->first();
        if (!$collect_info) {
            return;
        }

        $this->log_line($project_id, $collect_id, 'collect start');

        $collect_info->status = CollectTask::STATUS_ING;
        $collect_info->save();

        // Fetch the page, download its resources and replace their URLs.
        try {
            $page_url = 'https://' . $collect_info->domain . $collect_info->route;
            $html = file_get_contents($page_url);
            if ($html === false) {
                // Never store `false` as page content when warnings are not turned into exceptions.
                throw new \RuntimeException('failed to fetch ' . $page_url);
            }

            $source_list = $this->html_preg($html, $project_id, $collect_info->domain);
            if ($source_list) {
                $html = $this->upload_source($html, $source_list, $project_id);
            }
        } catch (\Exception $e) {
            // NOTE(review): the task stays in STATUS_ING after a failure, so it is never
            // retried; resetting it to STATUS_UN would retry it forever. Confirm the intent.
            $this->log_line($project_id, $collect_id, 'error: ' . $e->getMessage());
            return;
        }

        $collect_info->html = $html;
        $collect_info->status = CollectTask::STATUS_COM;
        $collect_info->save();

        $this->log_line($project_id, $collect_id, 'collect end');
    }

    /**
     * Writes one progress line to standard output.
     *
     * @param string $project_id
     * @param string $collect_id
     * @param string $message
     * @return void
     */
    private function log_line($project_id, $collect_id, $message)
    {
        echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', ' . $message . PHP_EOL;
    }

    /**
     * Returns the next task to process.
     *
     * Pops from the Redis list first. When the list is empty it is refilled with
     * up to 50 pending tasks of the first project whose update log has
     * status STATUS_COM and collect_status COLLECT_STATUS_MAIN.
     *
     * @return string|int|false "{project_id}_{collect_id}" for a task, 0 when the
     *                          current project had no pending task left and was marked
     *                          as collected, false when there is nothing to hand out
     */
    protected function get_task()
    {
        $key = 'console_html_language_collect_task';
        $task_id = Redis::rpop($key);
        if ($task_id) {
            return $task_id;
        }

        $update_log = UpdateLog::where('status', UpdateLog::STATUS_COM)
            ->where('collect_status', UpdateLog::COLLECT_STATUS_MAIN)
            ->orderBy('project_id', 'asc')
            ->first();
        if (!$update_log) {
            return false;
        }

        $source = $this->source_for_api_type($update_log->api_type);

        $complete = false;
        try {
            // Point the custom connection at the project's database.
            $project = ProjectServer::useProject($update_log->project_id);
            if ($project) {
                $collect_list = CollectTask::select(['id', 'project_id'])
                    ->where('project_id', $update_log->project_id)
                    ->where('source', $source)
                    ->where('language', '!=', '')
                    ->where('status', CollectTask::STATUS_UN)
                    ->orderBy('language', 'asc')
                    ->limit(50)
                    ->get();

                if ($collect_list->count() === 0) {
                    $complete = true;
                } else {
                    foreach ($collect_list as $collect) {
                        Redis::lpush($key, $collect->project_id . '_' . $collect->id);
                    }
                }
            }
        } finally {
            // Always release the project connection.
            DB::disconnect('custom_mysql');
        }

        if ($complete) {
            // Persist the new state; without save() the same project would be picked
            // again on every call and later projects would never be reached.
            $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
            $update_log->save();
            return 0;
        }

        // Normalise an empty pop (null or false depending on the Redis client) to false.
        $task_id = Redis::rpop($key);
        return $task_id ?: false;
    }

    /**
     * Maps an update-log api_type to the RouteMap source used to select tasks.
     * Unknown types fall back to RouteMap::SOURCE_PRODUCT.
     *
     * @param mixed $api_type
     * @return mixed one of the RouteMap::SOURCE_* constants
     */
    private function source_for_api_type($api_type)
    {
        $map = [
            'page' => RouteMap::SOURCE_PAGE,
            'news' => RouteMap::SOURCE_NEWS,
            'blog' => RouteMap::SOURCE_BLOG,
            'tag' => RouteMap::SOURCE_PRODUCT_KEYWORD,
        ];

        return $map[(string)$api_type] ?? RouteMap::SOURCE_PRODUCT;
    }

    /**
     * Extracts the resource URLs referenced by a page.
     *
     * Scans <img>, <script>, <source> and <link> tags plus CSS url(...) values and
     * runs every distinct URL through url_check(). A URL that appears several times
     * is reported once, so it is downloaded and recorded only once.
     *
     * @param string $html
     * @param string $project_id
     * @param string $domain domain of the page, used for host-less URLs
     * @return array list of url_check() results, in order of first appearance
     */
    protected function html_preg($html, $project_id, $domain)
    {
        $source = [];

        if (!$html) {
            return $source;
        }

        // Each entry: [pattern, index of the capture group holding the URL].
        $patterns = [
            // image
            ['/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
            // js
            ['/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
            // video
            ['/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
            // css
            ['/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
            // css background
            ["/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", 1],
        ];

        $seen = [];
        foreach ($patterns as $pattern) {
            if (!preg_match_all($pattern[0], $html, $matches)) {
                continue;
            }

            foreach ($matches[$pattern[1]] as $url) {
                if (isset($seen[$url])) {
                    continue;
                }
                $seen[$url] = true;

                $checked = $this->url_check($url, $project_id, $domain);
                if ($checked) {
                    $source[] = $checked;
                }
            }
        }

        return $source;
    }

    /**
     * Decides whether a resource URL has to be mirrored.
     *
     * A URL is ignored (false) when it is empty or unparsable, uses a scheme other
     * than http/https (data:, javascript:, ...), points to a ".globalso." or
     * ".goodao." host, or its path has no "." in it.
     *
     * @param string $url URL exactly as it appears in the document
     * @param string $project_id
     * @param string $domain host used when the URL has none
     * @return array|false ['download' => bool, 'url' => string, 'url_complete' => string];
     *                     with download=false, url_complete is the stored target of an
     *                     already mirrored resource
     */
    protected function url_check($url, $project_id, $domain)
    {
        if (!$url) {
            return false;
        }

        $arr = parse_url($url);
        if (!is_array($arr)) {
            // parse_url() returns false for seriously malformed URLs.
            return false;
        }

        $scheme = $arr['scheme'] ?? '';
        $host = $arr['host'] ?? '';
        $path = $arr['path'] ?? '';

        // Only http(s) resources, or scheme-less ones, can be fetched remotely.
        if ($scheme !== '' && !in_array(strtolower($scheme), ['http', 'https'], true)) {
            return false;
        }

        if (strpos($host, '.globalso.') !== false || strpos($host, '.goodao.') !== false) {
            return false;
        }

        if (!$path || strpos($path, '.') === false) {
            return false;
        }

        $source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
        if ($source) {
            return [
                'download' => false,
                'url' => $url,
                'url_complete' => $source['target'],
            ];
        }

        // A relative path such as "img/a.png" must not be glued onto the host name.
        // It is resolved against the site root; the page's own directory is not known here.
        if ($host === '' && $path[0] !== '/') {
            $path = '/' . $path;
        }

        // The query string and fragment are intentionally left out, as before.
        return [
            'download' => true,
            'url' => $url,
            'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path,
        ];
    }

    /**
     * Mirrors the resources of a page and rewrites their URLs in the HTML.
     *
     * Resources flagged for download are uploaded through CosService, recorded in
     * CollectSource and replaced by getImageUrl() of the new location; uploads that
     * fail leave the original URL untouched. Resources that were already mirrored
     * are only replaced. For stylesheets, the assets they reference are mirrored too.
     *
     * @param string $html
     * @param array $source list produced by html_preg()
     * @param string $project_id
     * @return string the rewritten HTML
     */
    protected function upload_source($html, $source, $project_id)
    {
        foreach ($source as $vs) {
            if (!$vs['download']) {
                $html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
                continue;
            }

            $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
            if (!$new_source) {
                continue;
            }

            $this->save_source($project_id, $vs['url'], $new_source);
            $html = str_replace($vs['url'], getImageUrl($new_source), $html);

            if (substr($new_source, -3) === 'css') {
                // Also download the resources referenced from inside the stylesheet.
                $this->upload_css_assets($vs['url_complete'], $new_source, $project_id);
            }
        }

        return $html;
    }

    /**
     * Mirrors the relative assets referenced by a stylesheet.
     *
     * Each asset is uploaded next to the mirrored stylesheet (same directory, same
     * relative path), so the relative references inside the untouched CSS keep
     * resolving. Absolute URLs, data: URIs and already recorded assets are skipped.
     *
     * @param string $css_url absolute URL of the original stylesheet
     * @param string $css_target location of the mirrored stylesheet
     * @param string $project_id
     * @return void
     */
    private function upload_css_assets($css_url, $css_target, $project_id)
    {
        $css_html = file_get_contents($css_url);
        if ($css_html === false || $css_html === '') {
            return;
        }

        // NOTE(review): only quoted url('...') values are matched here, unlike html_preg()
        // which also accepts unquoted ones — confirm whether that is intended.
        preg_match_all("/url\(['\"](\s*[^>]+?)['\"]\)/i", $css_html, $result_css_source);
        $css_source = $result_css_source[1] ?? [];

        $url_arr = explode('/', $css_url);
        $target_arr = explode('/', $css_target);
        $url_last = count($url_arr) - 1;
        $target_last = count($target_arr) - 1;

        foreach ($css_source as $vcs) {
            $vcs_arr = parse_url($vcs);
            if (!is_array($vcs_arr) || isset($vcs_arr['scheme']) || isset($vcs_arr['host'])) {
                // Not a relative path (absolute URL, data: URI, ...): do not download.
                continue;
            }

            $vcs = $vcs_arr['path'] ?? '';
            if (!$vcs) {
                continue;
            }

            // NOTE(review): the origin is the bare relative path, so equal paths coming
            // from different stylesheets share one record — verify this is acceptable.
            $source_info = CollectSource::where('project_id', $project_id)->where('origin', $vcs)->first();
            if ($source_info) {
                // Already mirrored: do not download again.
                continue;
            }

            // Swap the stylesheet's file name for the asset's relative path on both sides.
            $url_arr[$url_last] = $vcs;
            $url_css_complete = implode('/', $url_arr);
            $target_arr[$target_last] = $vcs;
            $path = implode('/', $target_arr);

            $new_source_css = CosService::uploadRemote($project_id, 'source', $url_css_complete, $path);
            if ($new_source_css) {
                $this->save_source($project_id, $vcs, $new_source_css);
            }
        }
    }

    /**
     * Records the mapping between an original resource URL and its mirrored location.
     *
     * @param string $project_id
     * @param string $origin URL as found in the page or stylesheet
     * @param string $target location returned by CosService::uploadRemote()
     * @return void
     */
    private function save_source($project_id, $origin, $target)
    {
        $now = date('Y-m-d H:i:s');

        CollectSource::insert([
            'project_id' => $project_id,
            'origin' => $origin,
            'target' => $target,
            'created_at' => $now,
            'updated_at' => $now,
        ]);
    }
}