作者 刘锟

update

... ... @@ -63,7 +63,7 @@ class HtmlLanguageCollect extends Command
// Set the database for this project.
$project = ProjectServer::useProject($project_id);
if ($project) {
// Look up the task by id; it must have status CollectTask::STATUS_UN and a non-empty language.
// NOTE(review): the two identical-looking assignments below appear to be the before/after pair
// of a rendered diff (the second one additionally selects `language`) — confirm against the real file.
$collect_info = CollectTask::select(['id', 'domain', 'route'])->where('id', $collect_id)->where('status', CollectTask::STATUS_UN)->where('language', '!=', '')->first();
$collect_info = CollectTask::select(['id', 'domain', 'route', 'language'])->where('id', $collect_id)->where('status', CollectTask::STATUS_UN)->where('language', '!=', '')->first();
if (!$collect_info) {
// No matching task row was found: pause for two seconds.
sleep(2);
... ... @@ -75,10 +75,18 @@ class HtmlLanguageCollect extends Command
// Mark the task with STATUS_ING and persist it before the collection work starts.
$collect_info->status = CollectTask::STATUS_ING;
$collect_info->save();
// NOTE(review): the next four statements look like the removed side of a rendered diff; the same
// variables are reassigned from $domain further down — confirm against the real file.
$web_url_domain = $collect_info->domain;
$home_url = $collect_info->domain;
$url_web_config = 'https://' . $collect_info->domain . '/wp-content/cache/user_config.text';
$data_config = http_get($url_web_config, ['charset' => 'UTF-8']);
// Get the English-site domain.
$domain = $collect_info->domain;
if (strpos($domain, '/') !== false) {
// The stored domain contains a '/': keep only the part before the first one.
$domain = substr($domain, 0, strpos($domain, '/'));
} else {
// Otherwise swap the task's language string for 'www'. str_replace() replaces every
// occurrence of that string in the domain, not only a leading sub-domain label.
$domain = str_replace($collect_info->language, 'www', $domain);
}
$web_url_domain = $domain;
$home_url = $domain;
// Fetch the cached user config from the derived domain.
$url_web_config = 'https://' . $domain . '/wp-content/cache/user_config.text';
$data_config = curl_c($url_web_config);
if ($data_config) {
// Parse the configured web_url_domain; an empty string is parsed when the key is missing.
$web_url_arr = parse_url($data_config['web_url_domain'] ?? '');
if (isset($web_url_arr['host'])) {
... ... @@ -93,20 +101,11 @@ class HtmlLanguageCollect extends Command
// Collect the HTML page, download its resources locally and replace them.
try {
// Stream-context options: a browser-like User-Agent header, with SSL peer and
// peer-name verification turned off.
$opts = [
'http' => [
'header' => 'User-Agent:Mozilla/5.0 (Windows NT 6.2; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0'
],
'ssl' => [
'verify_peer' => false,
'verify_peer_name' => false,
]
];
// NOTE(review): the same URL is fetched twice in a row (file_get_contents, then curl_c), and
// upload_source is called twice below; these look like before/after pairs of a rendered diff — confirm.
$html = file_get_contents('https://' . $collect_info->domain . $collect_info->route, false, stream_context_create($opts));
$html = curl_c('https://' . $collect_info->domain . $collect_info->route, false);
// Extract the list of resources referenced by the page.
$source_list = $this->html_preg($html, $project_id, $collect_info->domain, $web_url_domain, $home_url);
if ($source_list) {
$html = $this->upload_source($html, $source_list, $project_id, $opts);
$html = $this->upload_source($html, $source_list, $project_id);
}
} catch (\Exception $e) {
// Any exception raised above marks the task with STATUS_FAIL.
$collect_info->status = CollectTask::STATUS_FAIL;
... ... @@ -279,7 +278,7 @@ class HtmlLanguageCollect extends Command
}
// Download the resources and replace them.
// NOTE(review): two signatures appear back to back (with and without $opts); this looks like the
// before/after pair of a rendered diff — confirm which one the real file declares.
protected function upload_source($html, $source, $project_id, $opts)
protected function upload_source($html, $source, $project_id)
{
// Each $vs is one resource entry from the list passed in as $source.
foreach ($source as $vs) {
... ... @@ -297,7 +296,7 @@ class HtmlLanguageCollect extends Command
// Only resources whose name ends in the three characters 'css' get the nested-resource pass.
if (substr($new_source, -3, 3) == 'css') {
// Download the resources referenced inside the CSS file.
// NOTE(review): the stylesheet is fetched twice in a row (file_get_contents, then curl_c);
// this looks like the before/after pair of a rendered diff — confirm.
$css_html = file_get_contents($vs['url_complete'], false, stream_context_create($opts));
$css_html = curl_c($vs['url_complete'], false);
// Capture the quoted argument of every url('...') / url("...") (case-insensitive);
// an unquoted url(...) is not matched by this pattern.
preg_match_all("/url\(['\"](\s*[^>]+?)['\"]\)/i", $css_html, $result_css_source);
// Capture group 1 holds the matched paths; fall back to an empty list.
$css_source = $result_css_source[1] ?? [];
... ...
... ... @@ -111,11 +111,12 @@ if (!function_exists('http_get')) {
if (!function_exists('curl_c')) {
/**
 * Request $url with cURL and return the result of the transfer.
 *
 * @param $url      Address to request (presumably a full URL string — confirm against callers).
 * @param $is_array When truthy (the default) the response is passed through json_decode()
 *                  as an associative array; otherwise the value from curl_exec() is returned as-is.
 * @return mixed    Decoded JSON value, or the undecoded curl_exec() result when $is_array is falsy.
 * @author Akun
 * @date 2023/11/22 11:33
 */
// NOTE(review): two signatures appear back to back; this looks like the before/after pair of a
// rendered diff — the second one adds the optional $is_array flag. Confirm against the real file.
function curl_c($url){
function curl_c($url,$is_array=true){
// Request headers: an empty "Expect:" header plus a JSON content type.
$header = array(
'Expect:',
'Content-Type: application/json; charset=utf-8'
... ... @@ -130,11 +131,12 @@ if (!function_exists('curl_c')) {
// Cap the whole transfer at 120 seconds.
curl_setopt($ch, CURLOPT_TIMEOUT, 120);
// Allow at most 10 redirects.
curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
// NOTE(review): certificate and host-name verification are both switched off, so the peer's
// identity is not checked on HTTPS requests.
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
// NOTE(review): CURLOPT_SSLVERSION is documented to take one of the integer CURL_SSLVERSION_*
// constants; the string 'all' is not one of them — confirm the intent.
curl_setopt($ch, CURLOPT_SSLVERSION, 'all');
curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
$content = curl_exec($ch);
curl_close($ch);
// NOTE(review): two returns appear back to back; the first looks like the removed side of a
// rendered diff and would make the second unreachable if both were really present — confirm.
return json_decode($content, true);
// JSON-decode into an associative array when $is_array is truthy, else return $content untouched.
return $is_array ? json_decode($content, true) : $content;
}
}
... ...