|
...
|
...
|
@@ -75,10 +75,26 @@ class HtmlLanguageCollect extends Command |
|
|
|
$collect_info->status = CollectTask::STATUS_ING;
|
|
|
|
$collect_info->save();
|
|
|
|
|
|
|
|
$web_url_domain = $collect_info->domain;
|
|
|
|
$home_url = $collect_info->domain;
|
|
|
|
$url_web_config = 'https://' . $collect_info->domain . '/wp-content/cache/user_config.text';
|
|
|
|
$data_config = http_get($url_web_config, ['charset' => 'UTF-8']);
|
|
|
|
if ($data_config) {
|
|
|
|
$web_url_arr = parse_url($data_config['web_url_domain']);
|
|
|
|
if (isset($web_url_arr['host'])) {
|
|
|
|
$web_url_domain = $web_url_arr['host'];
|
|
|
|
}
|
|
|
|
|
|
|
|
$home_url_arr = parse_url($data_config['home_url']);
|
|
|
|
if (isset($home_url_arr['host'])) {
|
|
|
|
$home_url = $home_url_arr['host'];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fetch the HTML page, download its resources locally, and replace the references
|
|
|
|
try {
|
|
|
|
$html = file_get_contents('https://' . $collect_info->domain . $collect_info->route);
|
|
|
|
$source_list = $this->html_preg($html, $project_id, $collect_info->domain);
|
|
|
|
$source_list = $this->html_preg($html, $project_id, $collect_info->domain, $web_url_domain, $home_url);
|
|
|
|
|
|
|
|
if ($source_list) {
|
|
|
|
$html = $this->upload_source($html, $source_list, $project_id);
|
|
...
|
...
|
@@ -164,7 +180,7 @@ class HtmlLanguageCollect extends Command |
|
|
|
}
|
|
|
|
|
|
|
|
// Match resources in the HTML using regular expressions
|
|
|
|
protected function html_preg($html, $project_id, $domain)
|
|
|
|
protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
|
|
|
|
{
|
|
|
|
$source = [];
|
|
|
|
|
|
...
|
...
|
@@ -176,7 +192,7 @@ class HtmlLanguageCollect extends Command |
|
|
|
preg_match_all('/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_img);
|
|
|
|
$img = $result_img[2] ?? [];
|
|
|
|
foreach ($img as $vi) {
|
|
|
|
$check_vi = $this->url_check($vi, $project_id, $domain);
|
|
|
|
$check_vi = $this->url_check($vi, $project_id, $domain, $web_url_domain, $home_url);
|
|
|
|
$check_vi && $source[] = $check_vi;
|
|
|
|
}
|
|
|
|
|
|
...
|
...
|
@@ -184,7 +200,7 @@ class HtmlLanguageCollect extends Command |
|
|
|
preg_match_all('/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_js);
|
|
|
|
$js = $result_js[2] ?? [];
|
|
|
|
foreach ($js as $vj) {
|
|
|
|
$check_vj = $this->url_check($vj, $project_id, $domain);
|
|
|
|
$check_vj = $this->url_check($vj, $project_id, $domain, $web_url_domain, $home_url);
|
|
|
|
$check_vj && $source[] = $check_vj;
|
|
|
|
}
|
|
|
|
|
|
...
|
...
|
@@ -192,7 +208,7 @@ class HtmlLanguageCollect extends Command |
|
|
|
preg_match_all('/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_video);
|
|
|
|
$video = $result_video[2] ?? [];
|
|
|
|
foreach ($video as $vv) {
|
|
|
|
$check_vv = $this->url_check($vv, $project_id, $domain);
|
|
|
|
$check_vv = $this->url_check($vv, $project_id, $domain, $web_url_domain, $home_url);
|
|
|
|
$check_vv && $source[] = $check_vv;
|
|
|
|
}
|
|
|
|
|
|
...
|
...
|
@@ -200,7 +216,7 @@ class HtmlLanguageCollect extends Command |
|
|
|
preg_match_all('/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_css);
|
|
|
|
$css = $result_css[2] ?? [];
|
|
|
|
foreach ($css as $vc) {
|
|
|
|
$check_vc = $this->url_check($vc, $project_id, $domain);
|
|
|
|
$check_vc = $this->url_check($vc, $project_id, $domain, $web_url_domain, $home_url);
|
|
|
|
$check_vc && $source[] = $check_vc;
|
|
|
|
}
|
|
|
|
|
|
...
|
...
|
@@ -208,7 +224,7 @@ class HtmlLanguageCollect extends Command |
|
|
|
preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $html, $result_css_b);
|
|
|
|
$css_b = $result_css_b[1] ?? [];
|
|
|
|
foreach ($css_b as $vc_b) {
|
|
|
|
$check_vc_b = $this->url_check($vc_b, $project_id, $domain);
|
|
|
|
$check_vc_b = $this->url_check($vc_b, $project_id, $domain, $web_url_domain, $home_url);
|
|
|
|
$check_vc_b && $source[] = $check_vc_b;
|
|
|
|
}
|
|
|
|
|
|
...
|
...
|
@@ -217,7 +233,7 @@ class HtmlLanguageCollect extends Command |
|
|
|
}
|
|
|
|
|
|
|
|
// Decide whether a resource needs to be downloaded
|
|
|
|
protected function url_check($url, $project_id, $domain)
|
|
|
|
protected function url_check($url, $project_id, $domain, $web_url_domain, $home_url)
|
|
|
|
{
|
|
|
|
if ($url) {
|
|
|
|
$arr = parse_url($url);
|
|
...
|
...
|
@@ -227,11 +243,10 @@ class HtmlLanguageCollect extends Command |
|
|
|
$query = $arr['query'] ?? '';
|
|
|
|
|
|
|
|
if (
|
|
|
|
(strpos($host, '.globalso.') === false) &&
|
|
|
|
(strpos($host, '.goodao.') === false) &&
|
|
|
|
$path && (strpos($path, '.') !== false)
|
|
|
|
(empty($host) || $host == $web_url_domain || $host == $home_url)
|
|
|
|
&& $path
|
|
|
|
&& (strpos($path, '.') !== false)
|
|
|
|
) {
|
|
|
|
|
|
|
|
$source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
|
|
|
|
if (!$source) {
|
|
|
|
return [
|
|
...
|
...
|
@@ -290,6 +305,9 @@ class HtmlLanguageCollect extends Command |
|
|
|
if (!$vcs) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (strpos($vcs, '.') === false) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
$source_info = CollectSource::where('project_id', $project_id)->where('origin', $vcs)->first();
|
|
|
|
if ($source_info) {
|
...
|
...
|
|