|
|
|
<?php
|
|
|
|
|
|
|
|
namespace App\Console\Commands\Update;
|
|
|
|
|
|
|
|
use App\Models\Collect\CollectSource;
|
|
|
|
use App\Models\Collect\CollectTask;
|
|
|
|
use App\Models\Com\UpdateLog;
|
|
|
|
use App\Models\Com\UpdateOldInfo;
|
|
|
|
use App\Models\RouteMap\RouteMap;
|
|
|
|
use App\Services\CosService;
|
|
|
|
use App\Services\ProjectServer;
|
|
|
|
use Illuminate\Console\Command;
|
|
|
|
use Illuminate\Support\Facades\Cache;
|
|
|
|
use Illuminate\Support\Facades\DB;
|
|
|
|
use Illuminate\Support\Facades\Redis;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* 4.0,5.0升级到6.0,小语种页面采集
|
|
|
|
* Class HtmlLanguageSpecialCollect
|
|
|
|
* @package App\Console\Commands\Update
|
|
|
|
* @author Akun
|
|
|
|
* @date 2023/11/20 14:04
|
|
|
|
*/
|
|
|
|
class HtmlLanguageSpecialCollect extends Command
|
|
|
|
{
|
|
|
|
/**
|
|
|
|
* The name and signature of the console command.
|
|
|
|
*
|
|
|
|
* @var string
|
|
|
|
*/
|
|
|
|
protected $signature = 'project_html_language_special_collect';
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The console command description.
|
|
|
|
*
|
|
|
|
* @var string
|
|
|
|
*/
|
|
|
|
protected $description = '执行项目html页面采集';
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Command entry point.
 *
 * Raises the PHP memory limit to 512M and then keeps calling
 * start_collect() in an endless loop, one task attempt per iteration.
 * The method never returns on its own.
 */
public function handle()
{
    ini_set('memory_limit', '512M');

    // Worker loop: start_collect() does its own sleeping between attempts.
    for (;;) {
        $this->start_collect();
    }
}
|
|
|
|
|
|
|
|
/**
 * Take one task from the queue and collect the page it refers to.
 *
 * get_task() returns false when no update log is waiting (sleep 60s),
 * 0 when the current project has nothing left (sleep 2s), or a string of
 * the form "<project_id>_<collect_id>". For a valid task the project
 * database is selected, the page is collected, and the 'custom_mysql'
 * connection is always disconnected afterwards, also on failure paths.
 *
 * @return bool Always true.
 */
protected function start_collect()
{
    $task_id = $this->get_task();

    if ($task_id === false) {
        // Nothing is waiting for collection at all.
        sleep(60);
        return true;
    }

    if ($task_id === 0) {
        // The current project has been fully collected.
        sleep(2);
        return true;
    }

    // An empty pop result or a value without both parts cannot be processed;
    // back off briefly instead of reading undefined array offsets.
    $task_arr = is_string($task_id) ? explode('_', $task_id) : [];
    if (count($task_arr) < 2 || $task_arr[0] === '' || $task_arr[1] === '') {
        sleep(2);
        return true;
    }

    $project_id = $task_arr[0];
    $collect_id = $task_arr[1];

    try {
        // Select the project database before touching CollectTask.
        if (ProjectServer::useProject($project_id)) {
            $this->collect_page($project_id, $collect_id);
        }
    } finally {
        // Release the project connection on every path, including early exits.
        DB::disconnect('custom_mysql');
    }

    sleep(2);
    return true;
}

/**
 * Collect a single pending page: fetch its HTML, localise its resources
 * and store the result on the CollectTask row.
 *
 * The row is only processed while it is still in STATUS_UN with an empty
 * language. It is moved to STATUS_ING first, then to STATUS_COM on
 * success or STATUS_FAIL when fetching or processing fails.
 *
 * @param mixed $project_id Project the task belongs to.
 * @param mixed $collect_id Primary key of the CollectTask row.
 * @return void
 */
private function collect_page($project_id, $collect_id)
{
    $collect_info = CollectTask::select(['id', 'domain', 'route', 'language'])
        ->where('id', $collect_id)
        ->where('status', CollectTask::STATUS_UN)
        ->where('language', '=', '')
        ->first();

    if (!$collect_info) {
        return;
    }

    $this->collect_log($project_id, $collect_id, 'collect start');

    $collect_info->status = CollectTask::STATUS_ING;
    $collect_info->save();

    try {
        // Domain lookups live inside the try so that a failure here marks the
        // task as failed instead of leaving it in STATUS_ING.
        $domain_en = $this->get_domain_en($project_id);
        $old_info = UpdateOldInfo::getOldDomain($project_id, $domain_en);

        $html = curl_c('https://' . $collect_info->domain . $collect_info->route, false);

        // NOTE(review): curl_c is defined elsewhere; the loose comparison with
        // '0' is kept as-is because its exact failure value is not visible here.
        if ($html == '0' || strpos($html, '404 Not Found') !== false) {
            $collect_info->status = CollectTask::STATUS_FAIL;
            $collect_info->save();

            $error = $html == '0' ? 'no html' : '404 not found';
            $this->collect_log($project_id, $collect_id, 'error: ' . $error);
            return;
        }

        // Strip base64 payloads from a working copy before resource matching.
        // Only the payload characters are removed; the possessive quantifier
        // keeps large payloads from exhausting the backtrack limit.
        $new_html = preg_replace('#(data:[^;"\']*;base64,)[A-Za-z0-9+/=]++#', '$1', $html);
        if ($new_html === null) {
            // PCRE failure: fall back to matching against the untouched HTML.
            $new_html = $html;
        }

        $source_list = $this->html_preg($new_html, $project_id, $domain_en, $old_info['web_url_domain'], $old_info['home_url']);

        if ($source_list) {
            $html = $this->upload_source($html, $source_list, $project_id, $domain_en, $old_info['web_url_domain'], $old_info['home_url']);
        }
    } catch (\Throwable $e) {
        // Errors (not only Exceptions) must not kill the worker loop or leave
        // the task stuck in STATUS_ING.
        $collect_info->status = CollectTask::STATUS_FAIL;
        $collect_info->save();

        $this->collect_log($project_id, $collect_id, 'error: ' . $e->getMessage());
        return;
    }

    $collect_info->html = $html;
    $collect_info->status = CollectTask::STATUS_COM;
    $collect_info->save();

    $this->collect_log($project_id, $collect_id, 'collect end');
}

/**
 * Print one progress line for a task to standard output.
 *
 * @param mixed  $project_id Project id shown in the line.
 * @param mixed  $collect_id Task id shown in the line.
 * @param string $message    Trailing text, e.g. "collect start".
 * @return void
 */
private function collect_log($project_id, $collect_id, $message)
{
    echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', ' . $message . PHP_EOL;
}
|
|
|
|
|
|
|
|
/**
 * Fetch the next task id, refilling the Redis queue when it is empty.
 *
 * The queue holds strings of the form "<project_id>_<collect_id>". When it
 * is empty, the first UpdateLog row for projects 555/626 with
 * status = STATUS_COM and collect_status = COLLECT_STATUS_UN is used to
 * queue up to 50 pending CollectTask rows (empty language, STATUS_UN) of
 * the matching source type.
 *
 * @return string|int|false Task id string; 0 when the current project has
 *                          no pending rows (its update log is marked
 *                          COLLECT_STATUS_COM) or when the refilled queue
 *                          yielded nothing; false when no update log is
 *                          waiting or the project could not be selected.
 */
protected function get_task()
{
    $key = 'console_html_language_special_collect_task';

    $task_id = Redis::rpop($key);
    if ($task_id) {
        return $task_id;
    }

    $update_log = UpdateLog::whereIn('project_id', [555, 626])
        ->where('status', UpdateLog::STATUS_COM)
        ->where('collect_status', UpdateLog::COLLECT_STATUS_UN)
        ->first();

    if (!$update_log) {
        return false;
    }

    // Map the update log's api_type to the route source; anything that is
    // not page/news/blog is treated as product.
    $source_map = [
        'page' => RouteMap::SOURCE_PAGE,
        'news' => RouteMap::SOURCE_NEWS,
        'blog' => RouteMap::SOURCE_BLOG,
    ];
    $source = $source_map[$update_log->api_type] ?? RouteMap::SOURCE_PRODUCT;

    $complete = false;
    $project_ready = false;

    try {
        // Select the project database before querying CollectTask.
        if (ProjectServer::useProject($update_log->project_id)) {
            $project_ready = true;

            $collect_list = CollectTask::select(['id', 'project_id'])
                ->where('project_id', $update_log->project_id)
                ->where('source', $source)
                ->where('language', '=', '')
                ->where('status', CollectTask::STATUS_UN)
                ->orderBy('id', 'asc')
                ->limit(50)
                ->get();

            if ($collect_list->count() == 0) {
                $complete = true;
            } else {
                foreach ($collect_list as $collect) {
                    Redis::lpush($key, $collect['project_id'] . '_' . $collect['id']);
                }
            }
        }
    } finally {
        // Release the project connection even if the query failed.
        DB::disconnect('custom_mysql');
    }

    if ($complete) {
        $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
        $update_log->save();
        return 0;
    }

    $task_id = Redis::rpop($key);
    if ($task_id) {
        return $task_id;
    }

    // The pop came back empty. Never hand the raw empty value to the caller:
    // 0 asks for a short retry when rows were queued but already taken,
    // false asks for a long pause when the project could not be selected.
    return $project_ready ? 0 : false;
}
|
|
|
|
|
|
|
|
/**
 * Return the domain stored on the project's CollectTask rows whose
 * language is empty, cached per project.
 *
 * The cache key includes the project id: this command processes more than
 * one project, and a shared key would hand the first project's domain to
 * every other project until the entry expired.
 *
 * @param mixed $project_id Project whose domain is requested.
 * @return mixed The cached or freshly queried domain; empty when no
 *               matching row exists.
 */
protected function get_domain_en($project_id)
{
    $key = 'console_html_language_domain_en_' . $project_id;

    $domain = Cache::get($key);
    if (!$domain) {
        $domain = CollectTask::where('project_id', $project_id)->where('language', '')->value('domain');

        // Only cache a real value so that a missing row is looked up again
        // next time instead of being remembered as "no domain".
        if ($domain) {
            Cache::add($key, $domain, 3600);
        }
    }

    return $domain;
}
|
|
|
|
|
|
|
|
/**
 * Extract resource URLs from HTML and classify each with url_check().
 *
 * URLs are taken, in this order, from <img src>, <script src>,
 * <source src>, <link href>, CSS url(...) expressions and <a href>.
 * Every distinct raw URL is passed to url_check() once; accepted results
 * are returned without duplicates, in first-seen order.
 *
 * @param string $html           HTML to scan; an empty value yields [].
 * @param mixed  $project_id     Forwarded to url_check().
 * @param mixed  $domain         Forwarded to url_check().
 * @param mixed  $web_url_domain Forwarded to url_check().
 * @param mixed  $home_url       Forwarded to url_check().
 * @return array List of url_check() result arrays
 *               (keys: download, url, url_complete).
 */
protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
{
    $source = [];

    if (!$html) {
        return $source;
    }

    // Each entry: [pattern, index of the capture group that holds the URL].
    $matchers = [
        // image
        ['/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // js
        ['/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // video
        ['/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // css
        ['/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // css background
        ["/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", 1],
        // downloadable resources linked from <a> tags
        ['/<a\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
    ];

    // Raw URLs already handed to url_check(). It performs database lookups,
    // and a repeated URL would produce a duplicate result that is dropped
    // anyway, so the second call is pure overhead.
    $seen = [];

    foreach ($matchers as $matcher) {
        $pattern = $matcher[0];
        $group = $matcher[1];

        preg_match_all($pattern, $html, $matches);

        foreach ($matches[$group] ?? [] as $raw_url) {
            if (isset($seen[$raw_url])) {
                continue;
            }
            $seen[$raw_url] = true;

            $checked = $this->url_check($raw_url, $project_id, $domain, $web_url_domain, $home_url);

            // Different raw URLs can normalise to the same result, so the
            // result list itself is de-duplicated as well.
            if ($checked && !in_array($checked, $source, true)) {
                $source[] = $checked;
            }
        }
    }

    return $source;
}
|
|
|
|
|
|
|
|
/**
 * Decide whether a URL is a site resource that has to be downloaded.
 *
 * A URL qualifies when its scheme is empty, http or https; its host is
 * empty or is contained in $web_url_domain or $home_url; its path starts
 * with "/" and contains a dot; and the text after the last dot does not
 * contain "html", "php", "com" or "xml". Qualifying URLs are looked up in
 * CollectSource by origin, first as given and then with $web_url_domain
 * replaced by $home_url.
 *
 * @param string $url            URL as found in the markup.
 * @param mixed  $project_id     Project used for the CollectSource lookup.
 * @param mixed  $domain         Not used by this method.
 * @param string $web_url_domain Old domain, replaced by $home_url for the
 *                               second lookup.
 * @param string $home_url       Host used to build the download URL.
 * @return array|false False when the URL does not qualify; otherwise an
 *                     array with 'download' (true when no stored copy
 *                     exists), 'url' (the URL with double quotes removed)
 *                     and 'url_complete' (stored target, or the absolute
 *                     URL to download).
 */
protected function url_check($url, $project_id, $domain, $web_url_domain, $home_url)
{
    if (!$url) {
        return false;
    }

    $url = str_replace('"', '', $url);

    $parts = parse_url($url);
    $scheme = $parts['scheme'] ?? '';
    $host = $parts['host'] ?? '';
    $path = $parts['path'] ?? '';
    $query = $parts['query'] ?? '';

    // Text after the last dot of the path (the whole path if it has no dot).
    $segments = explode('.', $path);
    $path_end = $segments[count($segments) - 1];

    // The guards below run in the same order as the original conjunction.
    if (!empty($scheme) && $scheme != 'https' && $scheme != 'http') {
        return false;
    }

    if (!empty($host) && strpos($web_url_domain, $host) === false && strpos($home_url, $host) === false) {
        return false;
    }

    if (!$path) {
        return false;
    }

    if (substr($path, 0, 1) != '/') {
        return false;
    }

    if (strpos($path, '.') === false) {
        return false;
    }

    // Pages and feeds are not downloadable resources.
    foreach (['html', 'php', 'com', 'xml'] as $excluded) {
        if (strpos($path_end, $excluded) !== false) {
            return false;
        }
    }

    // Already stored under the URL exactly as written?
    $stored = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
    if ($stored) {
        return [
            'download' => false,
            'url' => $url,
            'url_complete' => $stored['target']
        ];
    }

    // Already stored under the same URL on the new domain?
    $new_url = str_replace($web_url_domain, $home_url, $url);
    $stored_new = CollectSource::where('project_id', $project_id)->where('origin', $new_url)->first();
    if ($stored_new) {
        return [
            'download' => false,
            'url' => $url,
            'url_complete' => $stored_new['target']
        ];
    }

    // Unknown resource: build the absolute URL it should be fetched from.
    return [
        'download' => true,
        'url' => $url,
        'url_complete' => ($scheme ?: 'https') . '://' . $home_url . $path . ($query ? '?' . $query : '')
    ];
}
|
|
|
|
|
|
|
|
/**
 * Download the listed resources, record them and rewrite the HTML to
 * point at the stored copies.
 *
 * Entries with download = true are uploaded through
 * CosService::uploadRemote(); on success a CollectSource row is inserted
 * and every occurrence of the original URL in $html is replaced. Stored
 * files whose name ends in "css" or "js" are additionally scanned for
 * nested resources. Entries with download = false are only rewritten to
 * their known target.
 *
 * @param string $html           Page HTML to rewrite.
 * @param array  $source         url_check() results (download, url, url_complete).
 * @param mixed  $project_id     Project the resources belong to.
 * @param mixed  $domain         Forwarded to url_check().
 * @param mixed  $web_url_domain Forwarded to url_check().
 * @param mixed  $home_url       Forwarded to url_check().
 * @return string The rewritten HTML.
 */
protected function upload_source($html, $source, $project_id, $domain, $web_url_domain, $home_url)
{
    foreach ($source as $vs) {
        if (!$vs['download']) {
            // A stored copy already exists: just point the HTML at it.
            $html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
            continue;
        }

        $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
        if (!$new_source) {
            // Upload failed: leave the original URL in place.
            continue;
        }

        CollectSource::insert([
            'project_id' => $project_id,
            'origin' => $vs['url'],
            'target' => $new_source,
            'created_at' => date('Y-m-d H:i:s'),
            'updated_at' => date('Y-m-d H:i:s'),
        ]);
        $html = str_replace($vs['url'], getImageUrl($new_source), $html);

        $is_css = substr($new_source, -3, 3) == 'css';
        if ($is_css || substr($new_source, -2, 2) == 'js') {
            $this->rewrite_nested_sources($new_source, $is_css, $vs['url'], $project_id, $domain, $web_url_domain, $home_url);
        }
    }

    return $html;
}

/**
 * Localise the resources referenced from inside a stored css/js file and
 * upload the rewritten file again under the same target.
 *
 * CSS files are scanned for url(...) expressions, JS files for
 * largeURL/thumbURL style assignments. Nothing is uploaded when the file
 * contains no such references.
 *
 * @param string $new_source     Stored target of the css/js file.
 * @param bool   $is_css         True for css, false for js.
 * @param string $parent_url     Original URL of the css/js file; relative
 *                               references are resolved against it.
 * @param mixed  $project_id     Project the resources belong to.
 * @param mixed  $domain         Forwarded to url_check().
 * @param mixed  $web_url_domain Forwarded to url_check().
 * @param mixed  $home_url       Forwarded to url_check().
 * @return void
 */
private function rewrite_nested_sources($new_source, $is_css, $parent_url, $project_id, $domain, $web_url_domain, $home_url)
{
    $source_html = curl_c(getImageUrl($new_source), false);

    if ($is_css) {
        preg_match_all("/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", $source_html, $result_source);
    } else {
        preg_match_all("/[large|thumb]+URL:['\"]+(\s*[^>]+?)['\"]+,/i", $source_html, $result_source);
    }

    $js_css_source = $result_source[1] ?? [];
    if (!$js_css_source) {
        return;
    }

    // Loop-invariant: read the CDN host once per file, not once per reference.
    $cos = config('filesystems.disks.cos');
    $cosCdn = $cos['cdn'] ?? '';

    foreach ($js_css_source as $vjs) {
        // A match may still carry a leading 'URL:"' fragment; keep only what
        // follows it.
        $marker = strpos($vjs, 'URL:"');
        if ($marker !== false) {
            $vjs = substr($vjs, $marker + 5);
        }

        $vjs_down = str_replace('"', '', $vjs);

        if (strpos($vjs_down, 'data:') !== false) {
            // Inline binary data, nothing to download.
            continue;
        }

        if (strlen($vjs_down) > 255) {
            // Skip overly long references.
            continue;
        }

        $vjs_down_arr = parse_url($vjs_down);
        $vjs_down_host = $vjs_down_arr['host'] ?? '';

        if ($vjs_down_host && $vjs_down_host == $cosCdn) {
            // Already served from the CDN.
            continue;
        }

        if (empty($vjs_down_host) && substr($vjs_down, 0, 1) != '/') {
            // Relative path: swap the file name of the parent URL for it.
            $url_arr = explode('/', $parent_url);
            $url_arr[count($url_arr) - 1] = $vjs_down;
            $vjs_down = implode('/', $url_arr);
        }

        $vjs_result = $this->url_check($vjs_down, $project_id, $domain, $web_url_domain, $home_url);
        if (!$vjs_result) {
            continue;
        }

        if (!$vjs_result['download']) {
            $source_html = str_replace($vjs, getImageUrl($vjs_result['url_complete']), $source_html);
            continue;
        }

        $new_vjs = CosService::uploadRemote($project_id, 'source', $vjs_result['url_complete']);
        if ($new_vjs) {
            CollectSource::insert([
                'project_id' => $project_id,
                'origin' => $vjs_result['url'],
                'target' => $new_vjs,
                'created_at' => date('Y-m-d H:i:s'),
                'updated_at' => date('Y-m-d H:i:s'),
            ]);
            $source_html = str_replace($vjs, getImageUrl($new_vjs), $source_html);
        }
    }

    // Store the rewritten css/js content under the same target.
    CosService::uploadRemote($project_id, 'source', $new_source, $new_source, $source_html);
}
|
|
|
|
} |