作者 赵彬吉
... ... @@ -75,10 +75,27 @@ class HtmlCollect extends Command
$collect_info->status = CollectTask::STATUS_ING;
$collect_info->save();
// Resolve the site's production and test domains; both default to the task's own domain.
$web_url_domain = $collect_info->domain;
$home_url = $collect_info->domain;
$url_web_config = 'https://' . $collect_info->domain . '/wp-content/cache/user_config.text';
// NOTE(review): http_get() is a project helper; this code indexes its result by key,
// so it is presumably a decoded array — confirm against the helper.
$data_config = http_get($url_web_config, ['charset' => 'UTF-8']);
if ($data_config) {
    // Read each key defensively: a missing or non-string entry must not raise here,
    // because this code runs before the try block that guards the collection itself.
    $web_url_raw = $data_config['web_url_domain'] ?? null;
    $web_url_arr = is_string($web_url_raw) ? parse_url($web_url_raw) : false;
    // parse_url() returns false for malformed URLs; isset() on false is simply false.
    if (isset($web_url_arr['host'])) {
        $web_url_domain = $web_url_arr['host'];
    }
    $home_url_raw = $data_config['home_url'] ?? null;
    $home_url_arr = is_string($home_url_raw) ? parse_url($home_url_raw) : false;
    if (isset($home_url_arr['host'])) {
        $home_url = $home_url_arr['host'];
    }
}
//采集html页面,下载资源到本地并替换
try {
$html = file_get_contents('https://' . $collect_info->domain . $collect_info->route);
$source_list = $this->html_preg($html, $project_id, $collect_info->domain);
$source_list = $this->html_preg($html, $project_id, $collect_info->domain, $web_url_domain, $home_url);
if ($source_list) {
$html = $this->upload_source($html, $source_list, $project_id);
... ... @@ -164,7 +181,7 @@ class HtmlCollect extends Command
}
//正则匹配html资源
protected function html_preg($html, $project_id, $domain)
protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
{
$source = [];
... ... @@ -176,7 +193,7 @@ class HtmlCollect extends Command
preg_match_all('/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_img);
$img = $result_img[2] ?? [];
foreach ($img as $vi) {
$check_vi = $this->url_check($vi, $project_id, $domain);
$check_vi = $this->url_check($vi, $project_id, $domain, $web_url_domain, $home_url);
$check_vi && $source[] = $check_vi;
}
... ... @@ -184,7 +201,7 @@ class HtmlCollect extends Command
preg_match_all('/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_js);
$js = $result_js[2] ?? [];
foreach ($js as $vj) {
$check_vj = $this->url_check($vj, $project_id, $domain);
$check_vj = $this->url_check($vj, $project_id, $domain, $web_url_domain, $home_url);
$check_vj && $source[] = $check_vj;
}
... ... @@ -192,7 +209,7 @@ class HtmlCollect extends Command
preg_match_all('/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_video);
$video = $result_video[2] ?? [];
foreach ($video as $vv) {
$check_vv = $this->url_check($vv, $project_id, $domain);
$check_vv = $this->url_check($vv, $project_id, $domain, $web_url_domain, $home_url);
$check_vv && $source[] = $check_vv;
}
... ... @@ -200,7 +217,7 @@ class HtmlCollect extends Command
preg_match_all('/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_css);
$css = $result_css[2] ?? [];
foreach ($css as $vc) {
$check_vc = $this->url_check($vc, $project_id, $domain);
$check_vc = $this->url_check($vc, $project_id, $domain, $web_url_domain, $home_url);
$check_vc && $source[] = $check_vc;
}
... ... @@ -208,7 +225,7 @@ class HtmlCollect extends Command
preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $html, $result_css_b);
$css_b = $result_css_b[1] ?? [];
foreach ($css_b as $vc_b) {
$check_vc_b = $this->url_check($vc_b, $project_id, $domain);
$check_vc_b = $this->url_check($vc_b, $project_id, $domain, $web_url_domain, $home_url);
$check_vc_b && $source[] = $check_vc_b;
}
... ... @@ -217,7 +234,7 @@ class HtmlCollect extends Command
}
//判断资源是否需要下载
protected function url_check($url, $project_id, $domain)
protected function url_check($url, $project_id, $domain, $web_url_domain, $home_url)
{
if ($url) {
$arr = parse_url($url);
... ... @@ -227,11 +244,10 @@ class HtmlCollect extends Command
$query = $arr['query'] ?? '';
if (
(strpos($host, '.globalso.') === false) &&
(strpos($host, '.goodao.') === false) &&
$path && (strpos($path, '.') !== false)
(empty($host) || $host == $web_url_domain || $host == $home_url)
&& $path
&& (strpos($path, '.') !== false)
) {
$source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
if (!$source) {
return [
... ... @@ -290,6 +306,9 @@ class HtmlCollect extends Command
if (!$vcs) {
continue;
}
if (strpos($vcs, '.') === false) {
continue;
}
$source_info = CollectSource::where('project_id', $project_id)->where('origin', $vcs)->first();
if ($source_info) {
... ...
... ... @@ -75,10 +75,26 @@ class HtmlLanguageCollect extends Command
$collect_info->status = CollectTask::STATUS_ING;
$collect_info->save();
// Resolve the site's production and test domains; both default to the task's own domain.
$web_url_domain = $collect_info->domain;
$home_url = $collect_info->domain;
$url_web_config = 'https://' . $collect_info->domain . '/wp-content/cache/user_config.text';
// NOTE(review): http_get() is a project helper; this code indexes its result by key,
// so it is presumably a decoded array — confirm against the helper.
$data_config = http_get($url_web_config, ['charset' => 'UTF-8']);
if ($data_config) {
    // Read each key defensively: a missing or non-string entry must not raise here,
    // because this code runs before the try block that guards the collection itself.
    $web_url_raw = $data_config['web_url_domain'] ?? null;
    $web_url_arr = is_string($web_url_raw) ? parse_url($web_url_raw) : false;
    // parse_url() returns false for malformed URLs; isset() on false is simply false.
    if (isset($web_url_arr['host'])) {
        $web_url_domain = $web_url_arr['host'];
    }
    $home_url_raw = $data_config['home_url'] ?? null;
    $home_url_arr = is_string($home_url_raw) ? parse_url($home_url_raw) : false;
    if (isset($home_url_arr['host'])) {
        $home_url = $home_url_arr['host'];
    }
}
//采集html页面,下载资源到本地并替换
try {
$html = file_get_contents('https://' . $collect_info->domain . $collect_info->route);
$source_list = $this->html_preg($html, $project_id, $collect_info->domain);
$source_list = $this->html_preg($html, $project_id, $collect_info->domain, $web_url_domain, $home_url);
if ($source_list) {
$html = $this->upload_source($html, $source_list, $project_id);
... ... @@ -164,7 +180,7 @@ class HtmlLanguageCollect extends Command
}
//正则匹配html资源
protected function html_preg($html, $project_id, $domain)
protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
{
$source = [];
... ... @@ -176,7 +192,7 @@ class HtmlLanguageCollect extends Command
preg_match_all('/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_img);
$img = $result_img[2] ?? [];
foreach ($img as $vi) {
$check_vi = $this->url_check($vi, $project_id, $domain);
$check_vi = $this->url_check($vi, $project_id, $domain, $web_url_domain, $home_url);
$check_vi && $source[] = $check_vi;
}
... ... @@ -184,7 +200,7 @@ class HtmlLanguageCollect extends Command
preg_match_all('/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_js);
$js = $result_js[2] ?? [];
foreach ($js as $vj) {
$check_vj = $this->url_check($vj, $project_id, $domain);
$check_vj = $this->url_check($vj, $project_id, $domain, $web_url_domain, $home_url);
$check_vj && $source[] = $check_vj;
}
... ... @@ -192,7 +208,7 @@ class HtmlLanguageCollect extends Command
preg_match_all('/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_video);
$video = $result_video[2] ?? [];
foreach ($video as $vv) {
$check_vv = $this->url_check($vv, $project_id, $domain);
$check_vv = $this->url_check($vv, $project_id, $domain, $web_url_domain, $home_url);
$check_vv && $source[] = $check_vv;
}
... ... @@ -200,7 +216,7 @@ class HtmlLanguageCollect extends Command
preg_match_all('/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_css);
$css = $result_css[2] ?? [];
foreach ($css as $vc) {
$check_vc = $this->url_check($vc, $project_id, $domain);
$check_vc = $this->url_check($vc, $project_id, $domain, $web_url_domain, $home_url);
$check_vc && $source[] = $check_vc;
}
... ... @@ -208,7 +224,7 @@ class HtmlLanguageCollect extends Command
preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $html, $result_css_b);
$css_b = $result_css_b[1] ?? [];
foreach ($css_b as $vc_b) {
$check_vc_b = $this->url_check($vc_b, $project_id, $domain);
$check_vc_b = $this->url_check($vc_b, $project_id, $domain, $web_url_domain, $home_url);
$check_vc_b && $source[] = $check_vc_b;
}
... ... @@ -217,7 +233,7 @@ class HtmlLanguageCollect extends Command
}
//判断资源是否需要下载
protected function url_check($url, $project_id, $domain)
protected function url_check($url, $project_id, $domain, $web_url_domain, $home_url)
{
if ($url) {
$arr = parse_url($url);
... ... @@ -227,11 +243,10 @@ class HtmlLanguageCollect extends Command
$query = $arr['query'] ?? '';
if (
(strpos($host, '.globalso.') === false) &&
(strpos($host, '.goodao.') === false) &&
$path && (strpos($path, '.') !== false)
(empty($host) || $host == $web_url_domain || $host == $home_url)
&& $path
&& (strpos($path, '.') !== false)
) {
$source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
if (!$source) {
return [
... ... @@ -290,6 +305,9 @@ class HtmlLanguageCollect extends Command
if (!$vcs) {
continue;
}
if (strpos($vcs, '.') === false) {
continue;
}
$source_info = CollectSource::where('project_id', $project_id)->where('origin', $vcs)->first();
if ($source_info) {
... ...
... ... @@ -104,7 +104,11 @@ class CosService
'verify_peer_name' => false,
]
];
// Download the remote file; any failure means there is nothing to upload.
try {
    $body = file_get_contents($file_url, false, stream_context_create($opts));
} catch (\Exception $e) {
    // Only reached when an error handler converts the failure warning into an exception.
    return '';
}
// file_get_contents() itself reports failure by returning false (with an E_WARNING),
// so handle that path explicitly instead of passing false on as the upload body.
if ($body === false) {
    return '';
}
try {
$cosClient->putObject([
... ...