|
|
|
<?php
|
|
|
|
|
|
|
|
namespace App\Console\Commands\Update;
|
|
|
|
|
|
|
|
use App\Models\Collect\CollectSource;
|
|
|
|
use App\Models\Collect\CollectTask;
|
|
|
|
use App\Models\Com\UpdateLog;
|
|
|
|
use App\Models\RouteMap\RouteMap;
|
|
|
|
use App\Services\CosService;
|
|
|
|
use App\Services\ProjectServer;
|
|
|
|
use Illuminate\Console\Command;
|
|
|
|
use Illuminate\Support\Facades\DB;
|
|
|
|
use Illuminate\Support\Facades\Redis;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* 4.0,5.0升级到6.0,小语种页面采集
|
|
|
|
* Class HtmlLanguageCollect
|
|
|
|
* @package App\Console\Commands\Update
|
|
|
|
* @author Akun
|
|
|
|
* @date 2023/11/20 14:04
|
|
|
|
*/
|
|
|
|
class HtmlLanguageCollect extends Command
|
|
|
|
{
|
|
|
|
/**
|
|
|
|
* The name and signature of the console command.
|
|
|
|
*
|
|
|
|
* @var string
|
|
|
|
*/
|
|
|
|
protected $signature = 'project_html_language_collect';
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The console command description.
|
|
|
|
*
|
|
|
|
* @var string
|
|
|
|
*/
|
|
|
|
protected $description = '执行项目html页面采集';
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Execute the console command.
 *
 * Keeps invoking start_collect() in an endless loop; the method only stops
 * when the process is terminated or an uncaught exception escapes.
 */
public function handle()
{
    for (;;) {
        $this->start_collect();
    }
}
|
|
|
|
|
|
|
|
/**
 * Process one queued collection task: fetch the page HTML, localise the
 * resources it references and store the result on the CollectTask row.
 *
 * The task identifier handed out by get_task() is expected to look like
 * "<project_id>_<collect_id>".
 *
 * @return bool|null true on the early-exit paths (nothing to do, malformed
 *                   task id, task no longer in STATUS_UN, fetch error);
 *                   null once a task ran to the end or the project could
 *                   not be selected.
 */
protected function start_collect()
{
    $task_id = $this->get_task();
    if ($task_id === false) {
        // No project is waiting for collection: back off for a minute.
        sleep(60);
        return true;
    }
    if ($task_id === 0) {
        // The current project has no pending task left: look again shortly.
        sleep(2);
        return true;
    }

    $task_arr = explode('_', (string)$task_id);
    $project_id = $task_arr[0];
    $collect_id = $task_arr[1] ?? '';
    if ($project_id === '' || $collect_id === '') {
        // Empty pop (null) or malformed queue entry: skip it instead of
        // failing on the missing second segment.
        sleep(2);
        return true;
    }

    try {
        // Select the project database.
        $project = ProjectServer::useProject($project_id);
        if ($project) {
            $collect_info = CollectTask::select(['id', 'domain', 'route'])
                ->where('id', $collect_id)
                ->where('status', CollectTask::STATUS_UN)
                ->first();

            if (!$collect_info) {
                // The task is gone or is no longer in STATUS_UN.
                sleep(2);
                return true;
            }

            echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', collect start' . PHP_EOL;

            $collect_info->status = CollectTask::STATUS_ING;
            $collect_info->save();

            // Fetch the HTML page, copy its resources and rewrite the links.
            try {
                $page_url = 'https://' . $collect_info->domain . $collect_info->route;
                $html = file_get_contents($page_url);
                if ($html === false) {
                    // Without an error handler that converts warnings into
                    // exceptions a failed fetch yields false; never store
                    // that as the page content of a "completed" task.
                    throw new \RuntimeException('failed to fetch ' . $page_url);
                }

                $source_list = $this->html_preg($html, $project_id, $collect_info->domain);
                if ($source_list) {
                    $html = $this->upload_source($html, $source_list, $project_id);
                }
            } catch (\Exception $e) {
                // NOTE(review): the task stays in STATUS_ING here, exactly as
                // before; confirm whether it should be reset or marked failed.
                echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', error: ' . $e->getMessage() . PHP_EOL;
                return true;
            }

            $collect_info->html = $html;
            $collect_info->status = CollectTask::STATUS_COM;
            $collect_info->save();

            echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', collect end' . PHP_EOL;
        }
    } finally {
        // Close the project database on every exit path, including the
        // early returns above.
        DB::disconnect('custom_mysql');
    }

    sleep(2);
}
|
|
|
|
|
|
|
|
/**
 * Fetch the next collection task.
 *
 * Tasks live in the Redis list "console_html_language_collect_task" as
 * "<project_id>_<collect_id>" strings. When the list is empty it is refilled
 * with up to 50 CollectTask rows (status STATUS_UN, non-empty language,
 * source matching the update log's api_type) of the first UpdateLog, ordered
 * by project_id, that has status STATUS_COM and collect_status
 * COLLECT_STATUS_MAIN.
 *
 * @return string|int|false the task id string; 0 when the selected project
 *                          has no matching task left (its update log is then
 *                          marked COLLECT_STATUS_COM); false when no update
 *                          log qualifies or nothing could be popped.
 */
protected function get_task()
{
    $key = 'console_html_language_collect_task';

    $task_id = Redis::rpop($key);
    if ($task_id) {
        return $task_id;
    }

    $update_log = UpdateLog::where('status', UpdateLog::STATUS_COM)
        ->where('collect_status', UpdateLog::COLLECT_STATUS_MAIN)
        ->orderBy('project_id', 'asc')
        ->first();
    if (!$update_log) {
        return false;
    }

    // Translate the log's api_type into the route-map source of the tasks.
    switch ($update_log->api_type) {
        case 'page':
            $source = RouteMap::SOURCE_PAGE;
            break;
        case 'news':
            $source = RouteMap::SOURCE_NEWS;
            break;
        case 'blog':
            $source = RouteMap::SOURCE_BLOG;
            break;
        case 'tag':
            $source = RouteMap::SOURCE_PRODUCT_KEYWORD;
            break;
        default:
            $source = RouteMap::SOURCE_PRODUCT;
            break;
    }

    $complete = false;
    try {
        // Select the project database.
        $project = ProjectServer::useProject($update_log->project_id);
        if ($project) {
            $collect_list = CollectTask::select(['id', 'project_id'])
                ->where('project_id', $update_log->project_id)
                ->where('source', $source)
                ->where('language', '!=', '')
                ->where('status', CollectTask::STATUS_UN)
                ->orderBy('language', 'asc')
                ->limit(50)
                ->get();

            if ($collect_list->isEmpty()) {
                $complete = true;
            } else {
                foreach ($collect_list as $collect) {
                    Redis::lpush($key, $collect->project_id . '_' . $collect->id);
                }
            }
        }
    } finally {
        // Close the project database even when a query throws.
        DB::disconnect('custom_mysql');
    }

    if ($complete) {
        $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
        // Persist the new state; without this the same project would be
        // selected again on every call and later projects never be reached.
        $update_log->save();
        return 0;
    }

    $task_id = Redis::rpop($key);

    // Some Redis clients report an empty list as null rather than false;
    // normalise so callers only have to deal with the documented values.
    return $task_id === null ? false : $task_id;
}
|
|
|
|
|
|
|
|
/**
 * Extract the resource URLs referenced by an HTML document via regular
 * expressions and run each of them through url_check().
 *
 * Covered references: <img src>, <script src>, <source src>, <link href>
 * and CSS url(...) expressions. Every distinct URL is checked only once, so
 * a resource referenced several times is not uploaded and rewritten twice
 * by upload_source().
 *
 * @param string|false $html       page markup; a falsy value yields an empty list
 * @param mixed        $project_id passed through to url_check()
 * @param string       $domain     passed through to url_check()
 * @return array list of the truthy url_check() results, in order of first appearance
 */
protected function html_preg($html, $project_id, $domain)
{
    $source = [];

    if (!$html) {
        return $source;
    }

    // Each entry: [pattern, index of the capture group that holds the URL].
    $patterns = [
        // image
        ['/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // js
        ['/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // video
        ['/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // css
        ['/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', 2],
        // css background
        ["/url\(['\"]?(\s*[^>]+?)['\"]?\)/i", 1],
    ];

    $seen = [];
    foreach ($patterns as $entry) {
        $pattern = $entry[0];
        $group = $entry[1];

        preg_match_all($pattern, $html, $matches);
        foreach ($matches[$group] ?? [] as $url) {
            if (isset($seen[$url])) {
                // Already handled; a second entry would only trigger a
                // duplicate upload and a second replacement pass.
                continue;
            }
            $seen[$url] = true;

            $checked = $this->url_check($url, $project_id, $domain);
            if ($checked) {
                $source[] = $checked;
            }
        }
    }

    return $source;
}
|
|
|
|
|
|
|
|
/**
 * Decide whether a referenced resource has to be downloaded.
 *
 * A URL is ignored (false) when it is empty or unparsable, uses a scheme
 * other than http/https, points at a host containing ".globalso." or
 * ".goodao.", or has no path containing a dot (i.e. no file extension).
 *
 * @param string $url        URL exactly as it appears in the markup
 * @param mixed  $project_id project whose CollectSource rows are consulted
 * @param string $domain     host used when the URL itself carries none
 * @return array|false false when the URL is ignored, otherwise an array with
 *                     'download'     => bool, true when no CollectSource row exists yet,
 *                     'url'          => the original URL,
 *                     'url_complete' => absolute URL to download, or the stored
 *                                       'target' of the existing CollectSource row
 */
protected function url_check($url, $project_id, $domain)
{
    if (!$url) {
        return false;
    }

    $arr = parse_url($url);
    if ($arr === false) {
        // Seriously malformed URL.
        return false;
    }

    $scheme = $arr['scheme'] ?? '';
    $host = $arr['host'] ?? '';
    $path = $arr['path'] ?? '';

    // Only web resources can be fetched; data:, javascript:, mailto: and
    // similar URIs would otherwise be turned into bogus download URLs.
    if ($scheme !== '' && !in_array(strtolower($scheme), ['http', 'https'], true)) {
        return false;
    }

    if (
        strpos($host, '.globalso.') !== false ||
        strpos($host, '.goodao.') !== false ||
        !$path ||
        strpos($path, '.') === false
    ) {
        return false;
    }

    $source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
    if ($source) {
        // Already collected earlier: reuse the stored target.
        return [
            'download' => false,
            'url' => $url,
            'url_complete' => $source['target'],
        ];
    }

    // A relative reference such as "images/a.png" has no leading slash;
    // without one the host and the path would be glued together.
    // NOTE(review): it is resolved against the site root because the page
    // route is not available here.
    if ($path[0] !== '/') {
        $path = '/' . $path;
    }

    return [
        'download' => true,
        'url' => $url,
        'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path,
    ];
}
|
|
|
|
|
|
|
|
/**
 * Upload the collected resources and replace their URLs in the HTML.
 *
 * Entries flagged 'download' are uploaded through CosService::uploadRemote(),
 * recorded in CollectSource and substituted in the markup; entries that were
 * collected before are only substituted. For uploaded stylesheets the
 * relative resources referenced inside the CSS file are uploaded as well.
 *
 * @param string $html       page markup
 * @param array  $source     list of url_check() results
 * @param mixed  $project_id project the resources belong to
 * @return string the markup with the resource URLs replaced
 */
protected function upload_source($html, $source, $project_id)
{
    foreach ($source as $vs) {
        if (!$vs['download']) {
            // Known resource: just point the markup at the stored target.
            $html = str_replace($vs['url'], getImageUrl($vs['url_complete']), $html);
            continue;
        }

        $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
        if (!$new_source) {
            // Upload failed: leave the original URL in place.
            continue;
        }

        $now = date('Y-m-d H:i:s');
        CollectSource::insert([
            'project_id' => $project_id,
            'origin' => $vs['url'],
            'target' => $new_source,
            'created_at' => $now,
            'updated_at' => $now,
        ]);
        $html = str_replace($vs['url'], getImageUrl($new_source), $html);

        if (substr($new_source, -3, 3) == 'css') {
            $this->upload_css_source($vs['url_complete'], $new_source, $project_id);
        }
    }

    return $html;
}

/**
 * Upload the relative resources referenced by quoted url(...) expressions
 * inside a stylesheet, keeping their position relative to the uploaded file.
 *
 * @param string $css_url    absolute URL the stylesheet was downloaded from
 * @param string $css_target value returned by the upload of the stylesheet
 * @param mixed  $project_id project the resources belong to
 * @return void
 */
private function upload_css_source($css_url, $css_target, $project_id)
{
    $css_html = file_get_contents($css_url);
    if (!$css_html) {
        // Nothing to scan (download failed or the file is empty).
        return;
    }

    preg_match_all("/url\(['\"](\s*[^>]+?)['\"]\)/i", $css_html, $result_css_source);
    $css_source = $result_css_source[1] ?? [];

    $url_arr = explode('/', $css_url);
    $target_arr = explode('/', $css_target);

    foreach ($css_source as $vcs) {
        $vcs_arr = parse_url($vcs);
        if (isset($vcs_arr['scheme']) || isset($vcs_arr['host'])) {
            // Not a relative path (absolute URL, protocol-relative URL or a
            // data: URI): do not download. parse_url() reports the host
            // under the key 'host'; there is no 'domain' key.
            continue;
        }

        $vcs = $vcs_arr['path'] ?? '';
        if (!$vcs) {
            continue;
        }

        $source_info = CollectSource::where('project_id', $project_id)->where('origin', $vcs)->first();
        if ($source_info) {
            // Already stored: do not download again.
            continue;
        }

        // Swap the stylesheet's file name for the relative path, both in the
        // remote URL and in the upload target.
        $url_arr[count($url_arr) - 1] = $vcs;
        $url_css_complete = implode('/', $url_arr);
        $target_arr[count($target_arr) - 1] = $vcs;
        $path = implode('/', $target_arr);

        $new_source_css = CosService::uploadRemote($project_id, 'source', $url_css_complete, $path);
        if ($new_source_css) {
            $now = date('Y-m-d H:i:s');
            CollectSource::insert([
                'project_id' => $project_id,
                'origin' => $vcs,
                'target' => $new_source_css,
                'created_at' => $now,
                'updated_at' => $now,
            ]);
        }
    }
}
|
|
|
|
} |
...
|
...
|
|