|
@@ -75,10 +75,26 @@ class HtmlLanguageCollect extends Command |
|
@@ -75,10 +75,26 @@ class HtmlLanguageCollect extends Command |
|
75
|
$collect_info->status = CollectTask::STATUS_ING;
|
75
|
$collect_info->status = CollectTask::STATUS_ING;
|
|
76
|
$collect_info->save();
|
76
|
$collect_info->save();
|
|
77
|
|
77
|
|
|
|
|
78
|
+ $web_url_domain = $collect_info->domain;
|
|
|
|
79
|
+ $home_url = $collect_info->domain;
|
|
|
|
80
|
+ $url_web_config = 'https://' . $collect_info->domain . '/wp-content/cache/user_config.text';
|
|
|
|
81
|
+ $data_config = http_get($url_web_config, ['charset' => 'UTF-8']);
|
|
|
|
82
|
+ if ($data_config) {
|
|
|
|
83
|
+ $web_url_arr = parse_url($data_config['web_url_domain']);
|
|
|
|
84
|
+ if (isset($web_url_arr['host'])) {
|
|
|
|
85
|
+ $web_url_domain = $web_url_arr['host'];
|
|
|
|
86
|
+ }
|
|
|
|
87
|
+
|
|
|
|
88
|
+ $home_url_arr = parse_url($data_config['home_url']);
|
|
|
|
89
|
+ if (isset($home_url_arr['host'])) {
|
|
|
|
90
|
+ $home_url = $home_url_arr['host'];
|
|
|
|
91
|
+ }
|
|
|
|
92
|
+ }
|
|
|
|
93
|
+
|
|
78
|
//采集html页面,下载资源到本地并替换
|
94
|
//采集html页面,下载资源到本地并替换
|
|
79
|
try {
|
95
|
try {
|
|
80
|
$html = file_get_contents('https://' . $collect_info->domain . $collect_info->route);
|
96
|
$html = file_get_contents('https://' . $collect_info->domain . $collect_info->route);
|
|
81
|
- $source_list = $this->html_preg($html, $project_id, $collect_info->domain);
|
97
|
+ $source_list = $this->html_preg($html, $project_id, $collect_info->domain, $web_url_domain, $home_url);
|
|
82
|
|
98
|
|
|
83
|
if ($source_list) {
|
99
|
if ($source_list) {
|
|
84
|
$html = $this->upload_source($html, $source_list, $project_id);
|
100
|
$html = $this->upload_source($html, $source_list, $project_id);
|
|
@@ -164,7 +180,7 @@ class HtmlLanguageCollect extends Command |
|
@@ -164,7 +180,7 @@ class HtmlLanguageCollect extends Command |
|
164
|
}
|
180
|
}
|
|
165
|
|
181
|
|
|
166
|
//正则匹配html资源
|
182
|
//正则匹配html资源
|
|
167
|
- protected function html_preg($html, $project_id, $domain)
|
183
|
+ protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
|
|
168
|
{
|
184
|
{
|
|
169
|
$source = [];
|
185
|
$source = [];
|
|
170
|
|
186
|
|
|
@@ -176,7 +192,7 @@ class HtmlLanguageCollect extends Command |
|
@@ -176,7 +192,7 @@ class HtmlLanguageCollect extends Command |
|
176
|
preg_match_all('/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_img);
|
192
|
preg_match_all('/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_img);
|
|
177
|
$img = $result_img[2] ?? [];
|
193
|
$img = $result_img[2] ?? [];
|
|
178
|
foreach ($img as $vi) {
|
194
|
foreach ($img as $vi) {
|
|
179
|
- $check_vi = $this->url_check($vi, $project_id, $domain);
|
195
|
+ $check_vi = $this->url_check($vi, $project_id, $domain, $web_url_domain, $home_url);
|
|
180
|
$check_vi && $source[] = $check_vi;
|
196
|
$check_vi && $source[] = $check_vi;
|
|
181
|
}
|
197
|
}
|
|
182
|
|
198
|
|
|
@@ -184,7 +200,7 @@ class HtmlLanguageCollect extends Command |
|
@@ -184,7 +200,7 @@ class HtmlLanguageCollect extends Command |
|
184
|
preg_match_all('/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_js);
|
200
|
preg_match_all('/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_js);
|
|
185
|
$js = $result_js[2] ?? [];
|
201
|
$js = $result_js[2] ?? [];
|
|
186
|
foreach ($js as $vj) {
|
202
|
foreach ($js as $vj) {
|
|
187
|
- $check_vj = $this->url_check($vj, $project_id, $domain);
|
203
|
+ $check_vj = $this->url_check($vj, $project_id, $domain, $web_url_domain, $home_url);
|
|
188
|
$check_vj && $source[] = $check_vj;
|
204
|
$check_vj && $source[] = $check_vj;
|
|
189
|
}
|
205
|
}
|
|
190
|
|
206
|
|
|
@@ -192,7 +208,7 @@ class HtmlLanguageCollect extends Command |
|
@@ -192,7 +208,7 @@ class HtmlLanguageCollect extends Command |
|
192
|
preg_match_all('/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_video);
|
208
|
preg_match_all('/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_video);
|
|
193
|
$video = $result_video[2] ?? [];
|
209
|
$video = $result_video[2] ?? [];
|
|
194
|
foreach ($video as $vv) {
|
210
|
foreach ($video as $vv) {
|
|
195
|
- $check_vv = $this->url_check($vv, $project_id, $domain);
|
211
|
+ $check_vv = $this->url_check($vv, $project_id, $domain, $web_url_domain, $home_url);
|
|
196
|
$check_vv && $source[] = $check_vv;
|
212
|
$check_vv && $source[] = $check_vv;
|
|
197
|
}
|
213
|
}
|
|
198
|
|
214
|
|
|
@@ -200,7 +216,7 @@ class HtmlLanguageCollect extends Command |
|
@@ -200,7 +216,7 @@ class HtmlLanguageCollect extends Command |
|
200
|
preg_match_all('/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_css);
|
216
|
preg_match_all('/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_css);
|
|
201
|
$css = $result_css[2] ?? [];
|
217
|
$css = $result_css[2] ?? [];
|
|
202
|
foreach ($css as $vc) {
|
218
|
foreach ($css as $vc) {
|
|
203
|
- $check_vc = $this->url_check($vc, $project_id, $domain);
|
219
|
+ $check_vc = $this->url_check($vc, $project_id, $domain, $web_url_domain, $home_url);
|
|
204
|
$check_vc && $source[] = $check_vc;
|
220
|
$check_vc && $source[] = $check_vc;
|
|
205
|
}
|
221
|
}
|
|
206
|
|
222
|
|
|
@@ -208,7 +224,7 @@ class HtmlLanguageCollect extends Command |
|
@@ -208,7 +224,7 @@ class HtmlLanguageCollect extends Command |
|
208
|
preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $html, $result_css_b);
|
224
|
preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $html, $result_css_b);
|
|
209
|
$css_b = $result_css_b[1] ?? [];
|
225
|
$css_b = $result_css_b[1] ?? [];
|
|
210
|
foreach ($css_b as $vc_b) {
|
226
|
foreach ($css_b as $vc_b) {
|
|
211
|
- $check_vc_b = $this->url_check($vc_b, $project_id, $domain);
|
227
|
+ $check_vc_b = $this->url_check($vc_b, $project_id, $domain, $web_url_domain, $home_url);
|
|
212
|
$check_vc_b && $source[] = $check_vc_b;
|
228
|
$check_vc_b && $source[] = $check_vc_b;
|
|
213
|
}
|
229
|
}
|
|
214
|
|
230
|
|
|
@@ -217,7 +233,7 @@ class HtmlLanguageCollect extends Command |
|
@@ -217,7 +233,7 @@ class HtmlLanguageCollect extends Command |
|
217
|
}
|
233
|
}
|
|
218
|
|
234
|
|
|
219
|
//判断资源是否需要下载
|
235
|
//判断资源是否需要下载
|
|
220
|
- protected function url_check($url, $project_id, $domain)
|
236
|
+ protected function url_check($url, $project_id, $domain, $web_url_domain, $home_url)
|
|
221
|
{
|
237
|
{
|
|
222
|
if ($url) {
|
238
|
if ($url) {
|
|
223
|
$arr = parse_url($url);
|
239
|
$arr = parse_url($url);
|
|
@@ -226,22 +242,6 @@ class HtmlLanguageCollect extends Command |
|
@@ -226,22 +242,6 @@ class HtmlLanguageCollect extends Command |
|
226
|
$path = $arr['path'] ?? '';
|
242
|
$path = $arr['path'] ?? '';
|
|
227
|
$query = $arr['query'] ?? '';
|
243
|
$query = $arr['query'] ?? '';
|
|
228
|
|
244
|
|
|
229
|
- $web_url_domain = $domain;
|
|
|
|
230
|
- $home_url = $domain;
|
|
|
|
231
|
- $url_web_config = 'https://' . $domain . '/wp-content/cache/user_config.text';
|
|
|
|
232
|
- $data_config = http_get($url_web_config, ['charset' => 'UTF-8']);
|
|
|
|
233
|
- if ($data_config) {
|
|
|
|
234
|
- $web_url_arr = parse_url($data_config['web_url_domain']);
|
|
|
|
235
|
- if (isset($web_url_arr['host'])) {
|
|
|
|
236
|
- $web_url_domain = $web_url_arr['host'];
|
|
|
|
237
|
- }
|
|
|
|
238
|
-
|
|
|
|
239
|
- $home_url_arr = parse_url($data_config['home_url']);
|
|
|
|
240
|
- if (isset($home_url_arr['host'])) {
|
|
|
|
241
|
- $home_url = $home_url_arr['host'];
|
|
|
|
242
|
- }
|
|
|
|
243
|
- }
|
|
|
|
244
|
-
|
|
|
|
245
|
if (empty($host) || $host == $web_url_domain || $host == $home_url) {
|
245
|
if (empty($host) || $host == $web_url_domain || $host == $home_url) {
|
|
246
|
$source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
|
246
|
$source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
|
|
247
|
if (!$source) {
|
247
|
if (!$source) {
|